/tests/staggered_invert_test.cpp
C++ | 592 lines | 457 code | 116 blank | 19 comment | 58 complexity | 4c9a5e0fcf0d005499572b332fc8f88a MD5 | raw file
Possible License(s): BSD-3-Clause
- #include <stdlib.h>
- #include <stdio.h>
- #include <time.h>
- #include <math.h>
- #include <test_util.h>
- #include <dslash_util.h>
- #include <blas_reference.h>
- #include <staggered_dslash_reference.h>
- #include <quda.h>
- #include <string.h>
- #include <face_quda.h>
- #include "misc.h"
- #include <gauge_field.h>
- #include <blas_quda.h>
- #if defined(QMP_COMMS)
- #include <qmp.h>
- #elif defined(MPI_COMMS)
- #include <mpi.h>
- #endif
- #ifdef MULTI_GPU
- #include <face_quda.h>
- #endif
- #define MAX(a,b) ((a)>(b)?(a):(b))
- #define mySpinorSiteSize 6
- extern void usage(char** argv);
- void *qdp_fatlink[4];
- void *qdp_longlink[4];
- void *fatlink;
- void *longlink;
- #ifdef MULTI_GPU
- void** ghost_fatlink, **ghost_longlink;
- #endif
- extern int device;
- extern bool tune;
- extern QudaReconstructType link_recon;
- extern QudaPrecision prec;
- QudaPrecision cpu_prec = QUDA_DOUBLE_PRECISION;
- extern QudaReconstructType link_recon_sloppy;
- extern QudaPrecision prec_sloppy;
- cpuColorSpinorField* in;
- cpuColorSpinorField* out;
- cpuColorSpinorField* ref;
- cpuColorSpinorField* tmp;
- cpuGaugeField *cpuFat = NULL;
- cpuGaugeField *cpuLong = NULL;
- static double tol = 1e-7;
- extern int test_type;
- extern int xdim;
- extern int ydim;
- extern int zdim;
- extern int tdim;
- extern int gridsize_from_cmdline[];
- static void end();
- template<typename Float>
- void constructSpinorField(Float *res) {
- for(int i = 0; i < Vh; i++) {
- for (int s = 0; s < 1; s++) {
- for (int m = 0; m < 3; m++) {
- res[i*(1*3*2) + s*(3*2) + m*(2) + 0] = rand() / (Float)RAND_MAX;
- res[i*(1*3*2) + s*(3*2) + m*(2) + 1] = rand() / (Float)RAND_MAX;
- }
- }
- }
- }
- static void
- set_params(QudaGaugeParam* gaugeParam, QudaInvertParam* inv_param,
- int X1, int X2, int X3, int X4,
- QudaPrecision cpu_prec, QudaPrecision prec, QudaPrecision prec_sloppy,
- QudaReconstructType link_recon, QudaReconstructType link_recon_sloppy,
- double mass, double tol, int maxiter, double reliable_delta,
- double tadpole_coeff
- )
- {
- gaugeParam->X[0] = X1;
- gaugeParam->X[1] = X2;
- gaugeParam->X[2] = X3;
- gaugeParam->X[3] = X4;
- gaugeParam->cpu_prec = cpu_prec;
- gaugeParam->cuda_prec = prec;
- gaugeParam->reconstruct = link_recon;
- gaugeParam->cuda_prec_sloppy = prec_sloppy;
- gaugeParam->reconstruct_sloppy = link_recon_sloppy;
- gaugeParam->gauge_fix = QUDA_GAUGE_FIXED_NO;
- gaugeParam->anisotropy = 1.0;
- gaugeParam->tadpole_coeff = tadpole_coeff;
- gaugeParam->scale = -1.0/(24.0*tadpole_coeff*tadpole_coeff);
- gaugeParam->t_boundary = QUDA_ANTI_PERIODIC_T;
- gaugeParam->gauge_order = QUDA_MILC_GAUGE_ORDER;
- gaugeParam->ga_pad = X1*X2*X3/2;
- inv_param->verbosity = QUDA_VERBOSE;
- inv_param->mass = mass;
- // outer solver parameters
- inv_param->inv_type = QUDA_CG_INVERTER;
- inv_param->tol = tol;
- inv_param->maxiter = 500000;
- inv_param->reliable_delta = 1e-1;
- #if __COMPUTE_CAPABILITY__ >= 200
- // require both L2 relative and heavy quark residual to determine convergence
- inv_param->residual_type = static_cast<QudaResidualType>(QUDA_L2_RELATIVE_RESIDUAL | QUDA_HEAVY_QUARK_RESIDUAL);
- inv_param->tol_hq = 1e-3; // specify a tolerance for the residual for heavy quark residual
- #else
- // Pre Fermi architecture only supports L2 relative residual norm
- inv_param->residual_type = QUDA_L2_RELATIVE_RESIDUAL;
- #endif
- //inv_param->inv_type = QUDA_GCR_INVERTER;
- //inv_param->gcrNkrylov = 10;
- // domain decomposition preconditioner parameters
- //inv_param->inv_type_precondition = QUDA_MR_INVERTER;
- //inv_param->tol_precondition = 1e-1;
- //inv_param->maxiter_precondition = 100;
- //inv_param->verbosity_precondition = QUDA_SILENT;
- //inv_param->prec_precondition = prec_sloppy;
- inv_param->solution_type = QUDA_MATPCDAG_MATPC_SOLUTION;
- inv_param->solve_type = QUDA_NORMOP_PC_SOLVE;
- inv_param->matpc_type = QUDA_MATPC_EVEN_EVEN;
- inv_param->dagger = QUDA_DAG_NO;
- inv_param->mass_normalization = QUDA_MASS_NORMALIZATION;
- inv_param->cpu_prec = cpu_prec;
- inv_param->cuda_prec = prec;
- inv_param->cuda_prec_sloppy = prec_sloppy;
- inv_param->preserve_source = QUDA_PRESERVE_SOURCE_YES;
- inv_param->gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; // this is meaningless, but must be thus set
- inv_param->dirac_order = QUDA_DIRAC_ORDER;
- inv_param->dslash_type = QUDA_ASQTAD_DSLASH;
- inv_param->tune = tune ? QUDA_TUNE_YES : QUDA_TUNE_NO;
- inv_param->sp_pad = X1*X2*X3/2;
- inv_param->use_init_guess = QUDA_USE_INIT_GUESS_YES;
- inv_param->input_location = QUDA_CPU_FIELD_LOCATION;
- inv_param->output_location = QUDA_CPU_FIELD_LOCATION;
- }
- int
- invert_test(void)
- {
- QudaGaugeParam gaugeParam = newQudaGaugeParam();
- QudaInvertParam inv_param = newQudaInvertParam();
- double mass = 0.5;
- set_params(&gaugeParam, &inv_param,
- xdim, ydim, zdim, tdim,
- cpu_prec, prec, prec_sloppy,
- link_recon, link_recon_sloppy, mass, tol, 500, 1e-3,
- 0.8);
- // declare the dimensions of the communication grid
- initCommsGridQuda(4, gridsize_from_cmdline, NULL, NULL);
- // this must be before the FaceBuffer is created (this is because it allocates pinned memory - FIXME)
- initQuda(device);
- setDims(gaugeParam.X);
- setSpinorSiteSize(6);
- size_t gSize = (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
- for (int dir = 0; dir < 4; dir++) {
- qdp_fatlink[dir] = malloc(V*gaugeSiteSize*gSize);
- qdp_longlink[dir] = malloc(V*gaugeSiteSize*gSize);
- }
- fatlink = malloc(4*V*gaugeSiteSize*gSize);
- longlink = malloc(4*V*gaugeSiteSize*gSize);
- construct_fat_long_gauge_field(qdp_fatlink, qdp_longlink, 1, gaugeParam.cpu_prec, &gaugeParam);
- const double cos_pi_3 = 0.5; // Cos(pi/3)
- const double sin_pi_3 = sqrt(0.75); // Sin(pi/3)
- for(int dir=0; dir<4; ++dir){
- for(int i=0; i<V; ++i){
- for(int j=0; j<gaugeSiteSize; ++j){
- if(gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION){
- ((double*)qdp_fatlink[dir])[i*gaugeSiteSize + j] = 0.5*rand()/RAND_MAX;
- if(link_recon != QUDA_RECONSTRUCT_8 && link_recon != QUDA_RECONSTRUCT_12){ // incorporate non-trivial phase into long links
- if(j%2 == 0){
- const double real = ((double*)qdp_longlink[dir])[i*gaugeSiteSize + j];
- const double imag = ((double*)qdp_longlink[dir])[i*gaugeSiteSize + j + 1];
- ((double*)qdp_longlink[dir])[i*gaugeSiteSize + j] = real*cos_pi_3 - imag*sin_pi_3;
- ((double*)qdp_longlink[dir])[i*gaugeSiteSize + j + 1] = real*sin_pi_3 + imag*cos_pi_3;
- }
- }
- ((double*)fatlink)[(i*4 + dir)*gaugeSiteSize + j] = ((double*)qdp_fatlink[dir])[i*gaugeSiteSize + j];
- ((double*)longlink)[(i*4 + dir)*gaugeSiteSize + j] = ((double*)qdp_longlink[dir])[i*gaugeSiteSize + j];
- }else{
- ((float*)qdp_fatlink[dir])[i] = 0.5*rand()/RAND_MAX;
- if(link_recon != QUDA_RECONSTRUCT_8 && link_recon != QUDA_RECONSTRUCT_12){ // incorporate non-trivial phase into long links
- if(j%2 == 0){
- const float real = ((float*)qdp_longlink[dir])[i*gaugeSiteSize + j];
- const float imag = ((float*)qdp_longlink[dir])[i*gaugeSiteSize + j + 1];
- ((float*)qdp_longlink[dir])[i*gaugeSiteSize + j] = real*cos_pi_3 - imag*sin_pi_3;
- ((float*)qdp_longlink[dir])[i*gaugeSiteSize + j + 1] = real*sin_pi_3 + imag*cos_pi_3;
- }
- }
- ((double*)fatlink)[(i*4 + dir)*gaugeSiteSize + j] = ((double*)qdp_fatlink[dir])[i*gaugeSiteSize + j];
- ((float*)fatlink)[(i*4 + dir)*gaugeSiteSize + j] = ((float*)qdp_fatlink[dir])[i*gaugeSiteSize + j];
- ((float*)longlink)[(i*4 + dir)*gaugeSiteSize + j] = ((float*)qdp_longlink[dir])[i*gaugeSiteSize + j];
- }
- }
- }
- }
- ColorSpinorParam csParam;
- csParam.nColor=3;
- csParam.nSpin=1;
- csParam.nDim=4;
- for(int d = 0; d < 4; d++) {
- csParam.x[d] = gaugeParam.X[d];
- }
- csParam.x[0] /= 2;
- csParam.precision = inv_param.cpu_prec;
- csParam.pad = 0;
- csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
- csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
- csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
- csParam.gammaBasis = inv_param.gamma_basis;
- csParam.create = QUDA_ZERO_FIELD_CREATE;
- in = new cpuColorSpinorField(csParam);
- out = new cpuColorSpinorField(csParam);
- ref = new cpuColorSpinorField(csParam);
- tmp = new cpuColorSpinorField(csParam);
- if (inv_param.cpu_prec == QUDA_SINGLE_PRECISION){
- constructSpinorField((float*)in->V());
- }else{
- constructSpinorField((double*)in->V());
- }
- #ifdef MULTI_GPU
- int tmp_value = MAX(ydim*zdim*tdim/2, xdim*zdim*tdim/2);
- tmp_value = MAX(tmp_value, xdim*ydim*tdim/2);
- tmp_value = MAX(tmp_value, xdim*ydim*zdim/2);
- int fat_pad = tmp_value;
- int link_pad = 3*tmp_value;
- gaugeParam.type = QUDA_ASQTAD_FAT_LINKS;
- gaugeParam.reconstruct = QUDA_RECONSTRUCT_NO;
- GaugeFieldParam cpuFatParam(fatlink, gaugeParam);
- cpuFat = new cpuGaugeField(cpuFatParam);
- ghost_fatlink = (void**)cpuFat->Ghost();
- gaugeParam.type = QUDA_ASQTAD_LONG_LINKS;
- GaugeFieldParam cpuLongParam(longlink, gaugeParam);
- cpuLong = new cpuGaugeField(cpuLongParam);
- ghost_longlink = (void**)cpuLong->Ghost();
- gaugeParam.type = QUDA_ASQTAD_FAT_LINKS;
- gaugeParam.ga_pad = fat_pad;
- gaugeParam.reconstruct= gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO;
- loadGaugeQuda(fatlink, &gaugeParam);
- gaugeParam.type = QUDA_ASQTAD_LONG_LINKS;
- gaugeParam.ga_pad = link_pad;
- gaugeParam.reconstruct= link_recon;
- gaugeParam.reconstruct_sloppy = link_recon_sloppy;
- loadGaugeQuda(longlink, &gaugeParam);
- #else
- gaugeParam.type = QUDA_ASQTAD_FAT_LINKS;
- gaugeParam.reconstruct = gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO;
- loadGaugeQuda(fatlink, &gaugeParam);
- gaugeParam.type = QUDA_ASQTAD_LONG_LINKS;
- gaugeParam.reconstruct = link_recon;
- gaugeParam.reconstruct_sloppy = link_recon_sloppy;
- loadGaugeQuda(longlink, &gaugeParam);
- #endif
- double time0 = -((double)clock()); // Start the timer
- double nrm2=0;
- double src2=0;
- int ret = 0;
- switch(test_type){
- case 0: //even
- inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
- invertQuda(out->V(), in->V(), &inv_param);
- time0 += clock();
- time0 /= CLOCKS_PER_SEC;
- #ifdef MULTI_GPU
- matdagmat_mg4dir(ref, qdp_fatlink, qdp_longlink, ghost_fatlink, ghost_longlink,
- out, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, QUDA_EVEN_PARITY);
- #else
- matdagmat(ref->V(), qdp_fatlink, qdp_longlink, out->V(), mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), QUDA_EVEN_PARITY);
- #endif
- mxpy(in->V(), ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
- nrm2 = norm_2(ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
- src2 = norm_2(in->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
- break;
- case 1: //odd
- inv_param.matpc_type = QUDA_MATPC_ODD_ODD;
- invertQuda(out->V(), in->V(), &inv_param);
- time0 += clock(); // stop the timer
- time0 /= CLOCKS_PER_SEC;
- #ifdef MULTI_GPU
- matdagmat_mg4dir(ref, qdp_fatlink, qdp_longlink, ghost_fatlink, ghost_longlink,
- out, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, QUDA_ODD_PARITY);
- #else
- matdagmat(ref->V(), qdp_fatlink, qdp_longlink, out->V(), mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), QUDA_ODD_PARITY);
- #endif
- mxpy(in->V(), ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
- nrm2 = norm_2(ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
- src2 = norm_2(in->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
- break;
- case 2: //full spinor
- errorQuda("full spinor not supported\n");
- break;
- case 3: //multi mass CG, even
- case 4:
- #define NUM_OFFSETS 12
- {
- double masses[NUM_OFFSETS] ={0.002, 0.0021, 0.0064, 0.070, 0.077, 0.081, 0.1, 0.11, 0.12, 0.13, 0.14, 0.205};
- inv_param.num_offset = NUM_OFFSETS;
- // these can be set independently
- for (int i=0; i<inv_param.num_offset; i++) {
- inv_param.tol_offset[i] = inv_param.tol;
- inv_param.tol_hq_offset[i] = inv_param.tol_hq;
- }
- void* outArray[NUM_OFFSETS];
- int len;
- cpuColorSpinorField* spinorOutArray[NUM_OFFSETS];
- spinorOutArray[0] = out;
- for(int i=1;i < inv_param.num_offset; i++){
- spinorOutArray[i] = new cpuColorSpinorField(csParam);
- }
- for(int i=0;i < inv_param.num_offset; i++){
- outArray[i] = spinorOutArray[i]->V();
- inv_param.offset[i] = 4*masses[i]*masses[i];
- }
- len=Vh;
- if (test_type == 3) {
- inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
- } else {
- inv_param.matpc_type = QUDA_MATPC_ODD_ODD;
- }
- invertMultiShiftQuda(outArray, in->V(), &inv_param);
- cudaDeviceSynchronize();
- time0 += clock(); // stop the timer
- time0 /= CLOCKS_PER_SEC;
- printfQuda("done: total time = %g secs, compute time = %g, %i iter / %g secs = %g gflops\n",
- time0, inv_param.secs, inv_param.iter, inv_param.secs,
- inv_param.gflops/inv_param.secs);
- printfQuda("checking the solution\n");
- QudaParity parity = QUDA_INVALID_PARITY;
- if (inv_param.solve_type == QUDA_NORMOP_SOLVE){
- //parity = QUDA_EVENODD_PARITY;
- errorQuda("full parity not supported\n");
- }else if (inv_param.matpc_type == QUDA_MATPC_EVEN_EVEN){
- parity = QUDA_EVEN_PARITY;
- }else if (inv_param.matpc_type == QUDA_MATPC_ODD_ODD){
- parity = QUDA_ODD_PARITY;
- }else{
- errorQuda("ERROR: invalid spinor parity \n");
- exit(1);
- }
- for(int i=0;i < inv_param.num_offset;i++){
- printfQuda("%dth solution: mass=%f, ", i, masses[i]);
- #ifdef MULTI_GPU
- matdagmat_mg4dir(ref, qdp_fatlink, qdp_longlink, ghost_fatlink, ghost_longlink,
- spinorOutArray[i], masses[i], 0, inv_param.cpu_prec,
- gaugeParam.cpu_prec, tmp, parity);
- #else
- matdagmat(ref->V(), qdp_fatlink, qdp_longlink, outArray[i], masses[i], 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), parity);
- #endif
- mxpy(in->V(), ref->V(), len*mySpinorSiteSize, inv_param.cpu_prec);
- double nrm2 = norm_2(ref->V(), len*mySpinorSiteSize, inv_param.cpu_prec);
- double src2 = norm_2(in->V(), len*mySpinorSiteSize, inv_param.cpu_prec);
- double hqr = sqrt(HeavyQuarkResidualNormCpu(*spinorOutArray[i], *ref).z);
- double l2r = sqrt(nrm2/src2);
- printfQuda("Shift %d residuals: (L2 relative) tol %g, QUDA = %g, host = %g; (heavy-quark) tol %g, QUDA = %g, host = %g\n",
- i, inv_param.tol_offset[i], inv_param.true_res_offset[i], l2r,
- inv_param.tol_hq_offset[i], inv_param.true_res_hq_offset[i], hqr);
- //emperical, if the cpu residue is more than 1 order the target accuracy, the it fails to converge
- if (sqrt(nrm2/src2) > 10*inv_param.tol_offset[i]){
- ret |=1;
- }
- }
- for(int i=1; i < inv_param.num_offset;i++) delete spinorOutArray[i];
- }
- break;
- default:
- errorQuda("Unsupported test type");
- }//switch
- if (test_type <=2){
- double hqr = sqrt(HeavyQuarkResidualNormCpu(*out, *ref).z);
- double l2r = sqrt(nrm2/src2);
- printfQuda("Residuals: (L2 relative) tol %g, QUDA = %g, host = %g; (heavy-quark) tol %g, QUDA = %g, host = %g\n",
- inv_param.tol, inv_param.true_res, l2r, inv_param.tol_hq, inv_param.true_res_hq, hqr);
- printfQuda("done: total time = %g secs, compute time = %g secs, %i iter / %g secs = %g gflops, \n",
- time0, inv_param.secs, inv_param.iter, inv_param.secs,
- inv_param.gflops/inv_param.secs);
- }
- end();
- return ret;
- }
- static void
- end(void)
- {
- for(int i=0;i < 4;i++){
- free(qdp_fatlink[i]);
- free(qdp_longlink[i]);
- }
- free(fatlink);
- free(longlink);
- delete in;
- delete out;
- delete ref;
- delete tmp;
- if (cpuFat) delete cpuFat;
- if (cpuLong) delete cpuLong;
- endQuda();
- }
- void
- display_test_info()
- {
- printfQuda("running the following test:\n");
- printfQuda("prec sloppy_prec link_recon sloppy_link_recon test_type S_dimension T_dimension\n");
- printfQuda("%s %s %s %s %s %d/%d/%d %d \n",
- get_prec_str(prec),get_prec_str(prec_sloppy),
- get_recon_str(link_recon),
- get_recon_str(link_recon_sloppy), get_test_type(test_type), xdim, ydim, zdim, tdim);
- printfQuda("Grid partition info: X Y Z T\n");
- printfQuda(" %d %d %d %d\n",
- dimPartitioned(0),
- dimPartitioned(1),
- dimPartitioned(2),
- dimPartitioned(3));
- return ;
- }
- void
- usage_extra(char** argv )
- {
- printfQuda("Extra options:\n");
- printfQuda(" --tol <resid_tol> # Set residual tolerance\n");
- printfQuda(" --test <0/1> # Test method\n");
- printfQuda(" 0: Even even spinor CG inverter\n");
- printfQuda(" 1: Odd odd spinor CG inverter\n");
- printfQuda(" 3: Even even spinor multishift CG inverter\n");
- printfQuda(" 4: Odd odd spinor multishift CG inverter\n");
- printfQuda(" --cpu_prec <double/single/half> # Set CPU precision\n");
- return ;
- }
- int main(int argc, char** argv)
- {
- for (int i = 1; i < argc; i++) {
- if(process_command_line_option(argc, argv, &i) == 0){
- continue;
- }
- if( strcmp(argv[i], "--tol") == 0){
- float tmpf;
- if (i+1 >= argc){
- usage(argv);
- }
- sscanf(argv[i+1], "%f", &tmpf);
- if (tmpf <= 0){
- printf("ERROR: invalid tol(%f)\n", tmpf);
- usage(argv);
- }
- tol = tmpf;
- i++;
- continue;
- }
- if( strcmp(argv[i], "--cpu_prec") == 0){
- if (i+1 >= argc){
- usage(argv);
- }
- cpu_prec= get_prec(argv[i+1]);
- i++;
- continue;
- }
- printf("ERROR: Invalid option:%s\n", argv[i]);
- usage(argv);
- }
- if (prec_sloppy == QUDA_INVALID_PRECISION){
- prec_sloppy = prec;
- }
- if (link_recon_sloppy == QUDA_RECONSTRUCT_INVALID){
- link_recon_sloppy = link_recon;
- }
- // initialize QMP or MPI
- #if defined(QMP_COMMS)
- QMP_thread_level_t tl;
- QMP_init_msg_passing(&argc, &argv, QMP_THREAD_SINGLE, &tl);
- #elif defined(MPI_COMMS)
- MPI_Init(&argc, &argv);
- #endif
- // call srand() with a rank-dependent seed
- initRand();
- display_test_info();
- int ret = invert_test();
- display_test_info();
- // finalize the communications layer
- #if defined(QMP_COMMS)
- QMP_finalize_msg_passing();
- #elif defined(MPI_COMMS)
- MPI_Finalize();
- #endif
- return ret;
- }