VectorScalar.c - UnaryOpScalarArray(arrayGeneralAbs,fabs,ty…

/cbits/VectorScalar.c

https://bitbucket.org/haskellnumerics/vector-vectorized · C · 261 lines · 68 code · 42 blank · 151 comment · 3 complexity · 2b7e77145ce20b7e0e149d177978b8c9 MD5 · raw file

#include "simd.h"
#include <tgmath.h>
// #include <complex.h>



/*
note: using 32bit signed 
*/

/*
we have 3 versions: 
avx2(untested)
sse4/avx1 (and i guess sse2 by accident there too, well
explicit dot product vs not, so some complation)
plus scalar fallback

*/

/*
+ - * , abs,

note, for now using 32bit int types for array operation sizes and strides 
because you really shouldn't do more than 4gb of work in one sequential ffi call!




*/

/* CPP macro to generate declarations and body for strided scalar versions */

// UnaryOpScalarArray(arrayGeneralLog)
// UnaryOpScalarArray(arrayGeneralAbs,fabs,type) \ // this is wrong for complex numbers

/*



double               carg(double complex);

double               cimag(double complex);




double               creal(double complex);

double complex       cacos(double complex);

double complex       cacosh(double complex);


double complex       casin(double complex);

double complex       casinh(double complex);


double complex       catan(double complex);

double complex       catanh(double complex);


double complex       ccos(double complex);

double complex       ccosh(double complex);

double complex       cexp(double complex);

double complex       clog(double complex);

double complex       conj(double complex);



double complex       cproj(double complex);

double complex       csin(double complex);

double complex       csinh(double complex);

double complex       csqrt(double complex);

double complex       ctan(double complex);

double complex       ctanh(double complex);


double complex       cpow(double complex, double complex);
*/

/*



atan2()
cbrt()
ceil()
copysign()
erf()
erfc()
exp2()
expm1()
fdim()
floor()
 


fma()
fmax()
fmin()
fmod()
frexp()
hypot()
ilogb()
ldexp()
lgamma()
llrint()
 


llround()
log10()
log1p()
log2()
logb()
lrint()
lround()
nearbyint()
nextafter()
nexttoward()
 


remainder()
remquo()
rint()
round()
scalbn()
scalbln()
tgamma()
trunc()
 */



#define BinaryOpScalarArray(name,binaryop,type) void  name##_##type(int32_t length, type  *   left, \
int32_t leftStride ,type  *   right,int32_t rightStride, type *   result, \
int resultStride  ); \
 \
void name##_##type(int32_t length, type  *   left,int32_t leftStride ,type  *   right,int32_t rightStride, type *   result, int32_t resultStride  ){ \
    int32_t ix = 0 ;  \
    for (ix = 0; ix < length ; ix ++){ \
        result[ix* resultStride]= (left[ix*leftStride] ) binaryop (right[ix*rightStride]  ) ; \
        }  \
}

#define UnaryOpScalarArray(name,op,type) void name##_##type(int32_t length, type *  in,int32_t inStride,\
type *  out, int32_t outStride); \
 \
void name##_##type(int32_t length, type *  in,int32_t inStride, type *  out, \
int32_t outStride){ \
    int32_t ix = 0 ; \
    for(ix = 0 ; ix < length ; ix ++){ \
        /*  WARNING / NOTE: in the complex float case, for reciprocal unary op, it does the divide using doubles,  then casts back to complex float, so may have  unexpectedly nice precision
        */ \
        out[ix*outStride] = (type) op(in[ix*inStride]); \
    } \
}

#define DotProductScalarArray(name,binaryop, type,init) type name##_##type(int32_t length, type  *   left, \
int32_t leftStride ,type  *   right,int32_t rightStride); \
 \
 type name##_##type(int32_t length, type  *   left, int32_t leftStride ,type  *   right,int32_t rightStride){ \
    type res = init ; \
    int32_t ix = 0; \
    for(ix=0 ; ix < length ; ix++ ){ \
        res +=  binaryop((left[ix*leftStride] ),(right[ix*rightStride]  )) ; \
        } \
    return res ; \
    }



////////////
/// Scalar versions
///////////

/*
for now I shall assume that everything only works on < 4gb / 32 sized ranges


also, will try to write things so that if any of the read (input) arrays
*/

/*
do a typedef for the complex types so that writing the macro stuff 
mixes well
*/


#define negate(numexp)  (-(numexp))
#define reciprocal(numexp) (1.0/(numexp))


#define mkNumFracOpsScalar(type)  \
BinaryOpScalarArray(arrayPlus,+,type) \
BinaryOpScalarArray(arrayMinus,-,type) \
BinaryOpScalarArray(arrayTimes,*,type) \
BinaryOpScalarArray(arrayDivide,/,type) \
UnaryOpScalarArray(arrayNegate,negate,type) \
UnaryOpScalarArray(arrayReciprocal,reciprocal,type) \
UnaryOpScalarArray(arraySqrt,sqrt,type) 


#define realtimes(x,y) (x * y ) 

#define complextimes(x,y)  (x * conj(y))

typedef float complex complex_float ;
typedef double complex complex_double ;

DotProductScalarArray(arrayDotProduct,realtimes,double,0.0) 
DotProductScalarArray(arrayDotProduct,realtimes,float,0.0) 

// name mangling the complex dot products because I need to 
//  wrap them to take a singlen array pointer where I write the result
// because haskell currently doesn't have an FFI story for structs and complex numbers
DotProductScalarArray(arrayDotProduct_internal,complextimes,complex_double,0.0 + I*0.0) 
DotProductScalarArray(arrayDotProduct_internal,complextimes,complex_float,0.0f + I*0.0f) 


/// NOTE: complex valued dot products have a different type than the real float dot products
/// this is important to remember!

void arrayDotProduct_complex_double(int32_t length, complex_double  *   left, int32_t leftStride ,complex_double  *   right,int32_t rightStride, complex_double * resultSingleton);

void arrayDotProduct_complex_double(int32_t length, complex_double  *   left, int32_t leftStride ,complex_double  *   right,int32_t rightStride, complex_double * resultSingleton){
    complex_double result = arrayDotProduct_internal_complex_double(length,left,leftStride,right,rightStride);
    resultSingleton[0] = result ; 
}


void arrayDotProduct_complex_float(int32_t length, complex_float  *   left, int32_t leftStride ,complex_float  *   right,int32_t rightStride, complex_float * resultSingleton);

void arrayDotProduct_complex_float(int32_t length, complex_float  *   left, int32_t leftStride ,complex_float  *   right,int32_t rightStride, complex_float * resultSingleton){
    complex_float result = arrayDotProduct_internal_complex_float(length,left,leftStride,right,rightStride);
    resultSingleton[0] = result ; 
}




UnaryOpScalarArray(arrayGeneralAbs,fabs,float)
UnaryOpScalarArray(arrayGeneralAbs,fabs,double)

mkNumFracOpsScalar(complex_double)
mkNumFracOpsScalar(complex_float)
mkNumFracOpsScalar(double) 
mkNumFracOpsScalar(float)