PageRenderTime 47ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 1ms

/tests/hisq_paths_force_test.cpp

https://github.com/jinhou/quda
C++ | 775 lines | 582 code | 172 blank | 21 comment | 70 complexity | 8663c275598730975f848319bbec93af MD5 | raw file
Possible License(s): BSD-3-Clause
  1. #include <cstdio>
  2. #include <cstdlib>
  3. #include <cstring>
  4. #include <quda.h>
  5. #include "test_util.h"
  6. #include "gauge_field.h"
  7. #include "fat_force_quda.h"
  8. #include "misc.h"
  9. #include "hisq_force_reference.h"
  10. #include "hisq_force_quda.h"
  11. #include "hw_quda.h"
  12. #include <fat_force_quda.h>
  13. #include <face_quda.h>
  14. #include <dslash_quda.h>
  15. #include <sys/time.h>
  16. #define TDIFF(a,b) (b.tv_sec - a.tv_sec + 0.000001*(b.tv_usec - a.tv_usec))
  17. #include "fermion_force_reference.h"
  18. using namespace quda;
  19. extern void usage(char** argv);
  20. extern int device;
  21. cudaGaugeField *cudaGauge = NULL;
  22. cpuGaugeField *cpuGauge = NULL;
  23. cudaGaugeField *cudaForce = NULL;
  24. cpuGaugeField *cpuForce = NULL;
  25. cudaGaugeField *cudaMom = NULL;
  26. cpuGaugeField *cpuMom = NULL;
  27. cpuGaugeField *refMom = NULL;
  28. static QudaGaugeParam qudaGaugeParam;
  29. static QudaGaugeParam qudaGaugeParam_ex;
  30. static void* hw; // the array of half_wilson_vector
  31. QudaGaugeFieldOrder gauge_order = QUDA_QDP_GAUGE_ORDER;
  32. cpuGaugeField *cpuOprod = NULL;
  33. cudaGaugeField *cudaOprod = NULL;
  34. cpuGaugeField *cpuLongLinkOprod = NULL;
  35. cudaGaugeField *cudaLongLinkOprod = NULL;
  36. extern bool verify_results;
  37. int ODD_BIT = 1;
  38. extern int xdim, ydim, zdim, tdim;
  39. extern int gridsize_from_cmdline[];
  40. extern bool tune;
  41. extern QudaPrecision prec;
  42. extern QudaReconstructType link_recon;
  43. QudaPrecision link_prec = QUDA_DOUBLE_PRECISION;
  44. QudaPrecision hw_prec = QUDA_DOUBLE_PRECISION;
  45. QudaPrecision cpu_hw_prec = QUDA_DOUBLE_PRECISION;
  46. QudaPrecision mom_prec = QUDA_DOUBLE_PRECISION;
  47. void* siteLink_1d;
  48. void* siteLink_2d[4];
  49. void* siteLink_ex_2d[4];
  50. cudaGaugeField *cudaGauge_ex = NULL;
  51. cpuGaugeField *cpuGauge_ex = NULL;
  52. cudaGaugeField *cudaForce_ex = NULL;
  53. cpuGaugeField *cpuForce_ex = NULL;
  54. cpuGaugeField *cpuOprod_ex = NULL;
  55. cudaGaugeField *cudaOprod_ex = NULL;
  56. cpuGaugeField *cpuLongLinkOprod_ex = NULL;
  57. cudaGaugeField *cudaLongLinkOprod_ex = NULL;
  58. #ifdef MULTI_GPU
  59. GaugeFieldParam gParam_ex;
  60. #endif
  61. GaugeFieldParam gParam;
  62. static void setPrecision(QudaPrecision precision)
  63. {
  64. link_prec = precision;
  65. hw_prec = precision;
  66. cpu_hw_prec = precision;
  67. mom_prec = precision;
  68. return;
  69. }
  70. void
  71. total_staple_io_flops(QudaPrecision prec, QudaReconstructType recon, double* io, double* flops)
  72. {
  73. //total IO counting for the middle/side/all link kernels
  74. //Explanation about these numbers can be founed in the corresnponding kernel functions in
  75. //the hisq kernel core file
  76. int linksize = prec*recon;
  77. int cmsize = prec*18;
  78. int matrix_mul_flops = 198;
  79. int matrix_add_flops = 18;
  80. int num_calls_middle_link[6] = {24, 24, 96, 96, 24, 24};
  81. int middle_link_data_io[6][2] = {
  82. {3,6},
  83. {3,4},
  84. {3,7},
  85. {3,5},
  86. {3,5},
  87. {3,2}
  88. };
  89. int middle_link_data_flops[6][2] = {
  90. {3,1},
  91. {2,0},
  92. {4,1},
  93. {3,0},
  94. {4,1},
  95. {2,0}
  96. };
  97. int num_calls_side_link[2]= {192, 48};
  98. int side_link_data_io[2][2] = {
  99. {1, 6},
  100. {0, 3}
  101. };
  102. int side_link_data_flops[2][2] = {
  103. {2, 2},
  104. {0, 1}
  105. };
  106. int num_calls_all_link[2] ={192, 192};
  107. int all_link_data_io[2][2] = {
  108. {3, 8},
  109. {3, 6}
  110. };
  111. int all_link_data_flops[2][2] = {
  112. {6, 3},
  113. {4, 2}
  114. };
  115. double total_io = 0;
  116. for(int i = 0;i < 6; i++){
  117. total_io += num_calls_middle_link[i]
  118. *(middle_link_data_io[i][0]*linksize + middle_link_data_io[i][1]*cmsize);
  119. }
  120. for(int i = 0;i < 2; i++){
  121. total_io += num_calls_side_link[i]
  122. *(side_link_data_io[i][0]*linksize + side_link_data_io[i][1]*cmsize);
  123. }
  124. for(int i = 0;i < 2; i++){
  125. total_io += num_calls_all_link[i]
  126. *(all_link_data_io[i][0]*linksize + all_link_data_io[i][1]*cmsize);
  127. }
  128. total_io *= V;
  129. double total_flops = 0;
  130. for(int i = 0;i < 6; i++){
  131. total_flops += num_calls_middle_link[i]
  132. *(middle_link_data_flops[i][0]*matrix_mul_flops + middle_link_data_flops[i][1]*matrix_add_flops);
  133. }
  134. for(int i = 0;i < 2; i++){
  135. total_flops += num_calls_side_link[i]
  136. *(side_link_data_flops[i][0]*matrix_mul_flops + side_link_data_flops[i][1]*matrix_add_flops);
  137. }
  138. for(int i = 0;i < 2; i++){
  139. total_flops += num_calls_all_link[i]
  140. *(all_link_data_flops[i][0]*matrix_mul_flops + all_link_data_flops[i][1]*matrix_add_flops);
  141. }
  142. total_flops *= V;
  143. *io=total_io;
  144. *flops = total_flops;
  145. printfQuda("flop/byte =%.1f\n", total_flops/total_io);
  146. return ;
  147. }
  148. // allocate memory
  149. // set the layout, etc.
  150. static void
  151. hisq_force_init()
  152. {
  153. initQuda(device);
  154. qudaGaugeParam.X[0] = xdim;
  155. qudaGaugeParam.X[1] = ydim;
  156. qudaGaugeParam.X[2] = zdim;
  157. qudaGaugeParam.X[3] = tdim;
  158. setDims(qudaGaugeParam.X);
  159. qudaGaugeParam.cpu_prec = link_prec;
  160. qudaGaugeParam.cuda_prec = link_prec;
  161. qudaGaugeParam.reconstruct = link_recon;
  162. // qudaGaugeParam.gauge_order = QUDA_MILC_GAUGE_ORDER;
  163. qudaGaugeParam.gauge_order = gauge_order;
  164. qudaGaugeParam.anisotropy = 1.0;
  165. memcpy(&qudaGaugeParam_ex, &qudaGaugeParam, sizeof(QudaGaugeParam));
  166. qudaGaugeParam_ex.X[0] = qudaGaugeParam.X[0] + 4;
  167. qudaGaugeParam_ex.X[1] = qudaGaugeParam.X[1] + 4;
  168. qudaGaugeParam_ex.X[2] = qudaGaugeParam.X[2] + 4;
  169. qudaGaugeParam_ex.X[3] = qudaGaugeParam.X[3] + 4;
  170. gParam = GaugeFieldParam(0, qudaGaugeParam);
  171. gParam.create = QUDA_NULL_FIELD_CREATE;
  172. gParam.link_type = QUDA_GENERAL_LINKS;
  173. gParam.order = QUDA_QDP_GAUGE_ORDER;
  174. cpuGauge = new cpuGaugeField(gParam);
  175. #ifdef MULTI_GPU
  176. gParam_ex = GaugeFieldParam(0, qudaGaugeParam_ex);
  177. gParam_ex.create = QUDA_NULL_FIELD_CREATE;
  178. gParam_ex.link_type = QUDA_GENERAL_LINKS;
  179. gParam_ex.order = QUDA_QDP_GAUGE_ORDER;
  180. cpuGauge_ex = new cpuGaugeField(gParam_ex);
  181. #endif
  182. int gSize = qudaGaugeParam.cpu_prec;
  183. // this is a hack to get the gauge field to appear as a void** rather than void*
  184. for(int i=0;i < 4;i++){
  185. #ifdef GPU_DIRECT
  186. if(cudaMallocHost(&siteLink_2d[i], V*gaugeSiteSize* qudaGaugeParam.cpu_prec) == cudaErrorMemoryAllocation) {
  187. errorQuda("ERROR: cudaMallocHost failed for sitelink_2d\n");
  188. }
  189. if(cudaMallocHost((void**)&siteLink_ex_2d[i], V_ex*gaugeSiteSize*qudaGaugeParam.cpu_prec) == cudaErrorMemoryAllocation) {
  190. errorQuda("ERROR: cudaMallocHost failed for sitelink_ex_2d\n");
  191. }
  192. #else
  193. siteLink_2d[i] = malloc(V*gaugeSiteSize* qudaGaugeParam.cpu_prec);
  194. siteLink_ex_2d[i] = malloc(V_ex*gaugeSiteSize*qudaGaugeParam.cpu_prec);
  195. #endif
  196. if(siteLink_2d[i] == NULL || siteLink_ex_2d[i] == NULL){
  197. errorQuda("malloc failed for siteLink_2d/siteLink_ex_2d\n");
  198. }
  199. memset(siteLink_2d[i], 0, V*gaugeSiteSize* qudaGaugeParam.cpu_prec);
  200. memset(siteLink_ex_2d[i], 0, V_ex*gaugeSiteSize*qudaGaugeParam.cpu_prec);
  201. }
  202. //siteLink_1d is only used in fermion reference computation
  203. siteLink_1d = malloc(4*V*gaugeSiteSize* qudaGaugeParam.cpu_prec);
  204. // fills the gauge field with random numbers
  205. createSiteLinkCPU(siteLink_2d, qudaGaugeParam.cpu_prec, 1);
  206. int X1 = Z[0];
  207. int X2 = Z[1];
  208. int X3 = Z[2];
  209. int X4 = Z[3];
  210. for(int i=0; i < V_ex; i++){
  211. int sid = i;
  212. int oddBit=0;
  213. if(i >= Vh_ex){
  214. sid = i - Vh_ex;
  215. oddBit = 1;
  216. }
  217. int za = sid/E1h;
  218. int x1h = sid - za*E1h;
  219. int zb = za/E2;
  220. int x2 = za - zb*E2;
  221. int x4 = zb/E3;
  222. int x3 = zb - x4*E3;
  223. int x1odd = (x2 + x3 + x4 + oddBit) & 1;
  224. int x1 = 2*x1h + x1odd;
  225. if( x1< 2 || x1 >= X1 +2
  226. || x2< 2 || x2 >= X2 +2
  227. || x3< 2 || x3 >= X3 +2
  228. || x4< 2 || x4 >= X4 +2){
  229. continue;
  230. }
  231. x1 = (x1 - 2 + X1) % X1;
  232. x2 = (x2 - 2 + X2) % X2;
  233. x3 = (x3 - 2 + X3) % X3;
  234. x4 = (x4 - 2 + X4) % X4;
  235. int idx = (x4*X3*X2*X1+x3*X2*X1+x2*X1+x1)>>1;
  236. if(oddBit){
  237. idx += Vh;
  238. }
  239. for(int dir= 0; dir < 4; dir++){
  240. char* src = (char*)siteLink_2d[dir];
  241. char* dst = (char*)siteLink_ex_2d[dir];
  242. memcpy(dst+i*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
  243. }//dir
  244. }//i
  245. for(int dir = 0; dir < 4; dir++){
  246. for(int i = 0;i < V; i++){
  247. char* src = (char*)siteLink_2d[dir];
  248. char* dst = (char*)siteLink_1d;
  249. memcpy(dst + (4*i+dir)*gaugeSiteSize*link_prec, src + i*gaugeSiteSize*link_prec, gaugeSiteSize \
  250. *link_prec);
  251. }
  252. }
  253. if(gauge_order == QUDA_MILC_GAUGE_ORDER){
  254. for(int dir = 0; dir < 4; dir++){
  255. for(int i = 0;i < V; i++){
  256. char* src = (char*)siteLink_2d[dir];
  257. char* dst = (char*)cpuGauge->Gauge_p();
  258. memcpy(dst + (4*i+dir)*gaugeSiteSize*link_prec, src + i*gaugeSiteSize*link_prec, gaugeSiteSize*link_prec);
  259. }
  260. }
  261. }else{
  262. for(int dir=0;dir < 4; dir++){
  263. char* src = (char*)siteLink_2d[dir];
  264. char* dst = ((char**)cpuGauge->Gauge_p())[dir];
  265. memcpy(dst, src, V*gaugeSiteSize*link_prec);
  266. }
  267. }
  268. #ifdef MULTI_GPU
  269. //for multi-gpu we have to use qdp format now
  270. if(gauge_order == QUDA_MILC_GAUGE_ORDER){
  271. errorQuda("multi_gpu milc is not supported\n");
  272. }
  273. for(int dir=0;dir < 4; dir++){
  274. char* src = (char*)siteLink_ex_2d[dir];
  275. char* dst = ((char**)cpuGauge_ex->Gauge_p())[dir];
  276. memcpy(dst, src, V_ex*gaugeSiteSize*link_prec);
  277. }
  278. #endif
  279. #ifdef MULTI_GPU
  280. gParam_ex.precision = prec;
  281. gParam_ex.reconstruct = link_recon;
  282. //gParam_ex.pad = E1*E2*E3/2;
  283. gParam_ex.pad = 0;
  284. gParam_ex.ghostInit = false;
  285. gParam_ex.order = QUDA_FLOAT2_GAUGE_ORDER;
  286. cudaGauge_ex = new cudaGaugeField(gParam_ex);
  287. qudaGaugeParam.site_ga_pad = gParam_ex.pad;
  288. //record gauge pad size
  289. #else
  290. gParam.precision = qudaGaugeParam.cuda_prec;
  291. gParam.reconstruct = link_recon;
  292. gParam.pad = X1*X2*X3/2;
  293. gParam.order = QUDA_FLOAT2_GAUGE_ORDER;
  294. cudaGauge = new cudaGaugeField(gParam);
  295. //record gauge pad size
  296. qudaGaugeParam.site_ga_pad = gParam.pad;
  297. #endif
  298. #ifdef MULTI_GPU
  299. gParam_ex.pad = 0;
  300. gParam_ex.reconstruct = QUDA_RECONSTRUCT_NO;
  301. gParam_ex.create = QUDA_ZERO_FIELD_CREATE;
  302. gParam_ex.order = QUDA_QDP_GAUGE_ORDER;
  303. cpuForce_ex = new cpuGaugeField(gParam_ex);
  304. gParam_ex.order = QUDA_FLOAT2_GAUGE_ORDER;
  305. gParam_ex.reconstruct = QUDA_RECONSTRUCT_NO;
  306. cudaForce_ex = new cudaGaugeField(gParam_ex);
  307. #else
  308. gParam.pad = 0;
  309. gParam.reconstruct = QUDA_RECONSTRUCT_NO;
  310. gParam.create = QUDA_ZERO_FIELD_CREATE;
  311. gParam.order = gauge_order;
  312. cpuForce = new cpuGaugeField(gParam);
  313. gParam.reconstruct = QUDA_RECONSTRUCT_NO;
  314. gParam.order = QUDA_FLOAT2_GAUGE_ORDER;
  315. cudaForce = new cudaGaugeField(gParam);
  316. #endif
  317. // create the momentum matrix
  318. gParam.pad = 0;
  319. gParam.reconstruct = QUDA_RECONSTRUCT_10;
  320. gParam.link_type = QUDA_ASQTAD_MOM_LINKS;
  321. gParam.order = QUDA_MILC_GAUGE_ORDER;
  322. gParam.create = QUDA_ZERO_FIELD_CREATE;
  323. cpuMom = new cpuGaugeField(gParam);
  324. refMom = new cpuGaugeField(gParam);
  325. //createMomCPU(cpuMom->Gauge_p(), mom_prec);
  326. hw = malloc(4*cpuGauge->Volume()*hwSiteSize*qudaGaugeParam.cpu_prec);
  327. if (hw == NULL){
  328. fprintf(stderr, "ERROR: malloc failed for hw\n");
  329. exit(1);
  330. }
  331. createHwCPU(hw, hw_prec);
  332. gParam.link_type = QUDA_GENERAL_LINKS;
  333. gParam.reconstruct = QUDA_RECONSTRUCT_NO;
  334. gParam.order = gauge_order;
  335. gParam.pad = 0;
  336. cpuOprod = new cpuGaugeField(gParam);
  337. computeLinkOrderedOuterProduct(hw, cpuOprod->Gauge_p(), hw_prec, 1, gauge_order);
  338. cpuLongLinkOprod = new cpuGaugeField(gParam);
  339. computeLinkOrderedOuterProduct(hw, cpuLongLinkOprod->Gauge_p(), hw_prec, 3, gauge_order);
  340. #ifdef MULTI_GPU
  341. gParam_ex.link_type = QUDA_GENERAL_LINKS;
  342. gParam_ex.reconstruct = QUDA_RECONSTRUCT_NO;
  343. gParam_ex.order = gauge_order;
  344. cpuOprod_ex = new cpuGaugeField(gParam_ex);
  345. //computeLinkOrderedOuterProduct(hw, cpuOprod_ex->Gauge_p(), hw_prec, 1, gauge_order);
  346. cpuLongLinkOprod_ex = new cpuGaugeField(gParam_ex);
  347. //computeLinkOrderedOuterProduct(hw, cpuLongLinkOprod_ex->Gauge_p(), hw_prec, 3, gauge_order);
  348. for(int i=0; i < V_ex; i++){
  349. int sid = i;
  350. int oddBit=0;
  351. if(i >= Vh_ex){
  352. sid = i - Vh_ex;
  353. oddBit = 1;
  354. }
  355. int za = sid/E1h;
  356. int x1h = sid - za*E1h;
  357. int zb = za/E2;
  358. int x2 = za - zb*E2;
  359. int x4 = zb/E3;
  360. int x3 = zb - x4*E3;
  361. int x1odd = (x2 + x3 + x4 + oddBit) & 1;
  362. int x1 = 2*x1h + x1odd;
  363. if( x1< 2 || x1 >= X1 +2
  364. || x2< 2 || x2 >= X2 +2
  365. || x3< 2 || x3 >= X3 +2
  366. || x4< 2 || x4 >= X4 +2){
  367. continue;
  368. }
  369. x1 = (x1 - 2 + X1) % X1;
  370. x2 = (x2 - 2 + X2) % X2;
  371. x3 = (x3 - 2 + X3) % X3;
  372. x4 = (x4 - 2 + X4) % X4;
  373. int idx = (x4*X3*X2*X1+x3*X2*X1+x2*X1+x1)>>1;
  374. if(oddBit){
  375. idx += Vh;
  376. }
  377. for(int dir= 0; dir < 4; dir++){
  378. char* src = ((char**)cpuOprod->Gauge_p())[dir];
  379. char* dst = ((char**)cpuOprod_ex->Gauge_p())[dir];
  380. memcpy(dst+i*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
  381. src = ((char**)cpuLongLinkOprod->Gauge_p())[dir];
  382. dst = ((char**)cpuLongLinkOprod_ex->Gauge_p())[dir];
  383. memcpy(dst+i*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
  384. }//dir
  385. }//i
  386. gParam_ex.order = QUDA_FLOAT2_GAUGE_ORDER;
  387. cudaOprod_ex = new cudaGaugeField(gParam_ex);
  388. gParam_ex.order = gauge_order;
  389. #else
  390. gParam.order = QUDA_FLOAT2_GAUGE_ORDER;
  391. cudaOprod = new cudaGaugeField(gParam);
  392. cudaLongLinkOprod = new cudaGaugeField(gParam);
  393. #endif
  394. return;
  395. }
  396. static void
  397. hisq_force_end()
  398. {
  399. for(int i = 0;i < 4; i++){
  400. #ifdef GPU_DIRECT
  401. cudaFreeHost(siteLink_2d[i]);
  402. cudaFreeHost(siteLink_ex_2d[i]);
  403. #else
  404. free(siteLink_2d[i]);
  405. free(siteLink_ex_2d[i]);
  406. #endif
  407. }
  408. free(siteLink_1d);
  409. delete cudaMom;
  410. delete cudaGauge;
  411. #ifdef MULTI_GPU
  412. delete cudaForce_ex;
  413. delete cudaGauge_ex;
  414. //delete cudaOprod_ex; // already deleted
  415. delete cudaLongLinkOprod_ex;
  416. #else
  417. delete cudaForce;
  418. delete cudaOprod;
  419. delete cudaLongLinkOprod;
  420. #endif
  421. delete cpuGauge;
  422. delete cpuMom;
  423. delete refMom;
  424. delete cpuOprod;
  425. delete cpuLongLinkOprod;
  426. #ifdef MULTI_GPU
  427. delete cpuGauge_ex;
  428. delete cpuForce_ex;
  429. delete cpuOprod_ex;
  430. delete cpuLongLinkOprod_ex;
  431. #else
  432. delete cpuForce;
  433. #endif
  434. free(hw);
  435. endQuda();
  436. return;
  437. }
  438. static int
  439. hisq_force_test(void)
  440. {
  441. tune = false;
  442. if (tune) setTuning(QUDA_TUNE_YES);
  443. setVerbosity(QUDA_VERBOSE);
  444. hisq_force_init();
  445. TimeProfile profile("dummy");
  446. initLatticeConstants(*cpuMom, profile);
  447. fermion_force::hisqForceInitCuda(&qudaGaugeParam);
  448. //float weight = 1.0;
  449. float act_path_coeff[6];
  450. act_path_coeff[0] = 0.625000;
  451. act_path_coeff[1] = -0.058479;
  452. act_path_coeff[2] = -0.087719;
  453. act_path_coeff[3] = 0.030778;
  454. act_path_coeff[4] = -0.007200;
  455. act_path_coeff[5] = -0.123113;
  456. //double d_weight = 1.0;
  457. double d_act_path_coeff[6];
  458. for(int i=0; i<6; ++i){
  459. d_act_path_coeff[i] = act_path_coeff[i];
  460. }
  461. #ifdef MULTI_GPU
  462. int optflag = 0;
  463. int R[4] = {2, 2, 2, 2};
  464. exchange_cpu_sitelink_ex(qudaGaugeParam.X, R, (void**)cpuGauge_ex->Gauge_p(), cpuGauge->Order(), qudaGaugeParam.cpu_prec, optflag);
  465. loadLinkToGPU_ex(cudaGauge_ex, cpuGauge_ex);
  466. #else
  467. loadLinkToGPU(cudaGauge, cpuGauge, &qudaGaugeParam);
  468. #endif
  469. #ifdef MULTI_GPU
  470. exchange_cpu_sitelink_ex(qudaGaugeParam.X, R, (void**)cpuOprod_ex->Gauge_p(), cpuOprod_ex->Order(), qudaGaugeParam.cpu_prec, optflag);
  471. loadLinkToGPU_ex(cudaOprod_ex, cpuOprod_ex);
  472. #else
  473. loadLinkToGPU(cudaOprod, cpuOprod, &qudaGaugeParam);
  474. #endif
  475. #ifdef MULTI_GPU
  476. exchange_cpu_sitelink_ex(qudaGaugeParam.X, R, (void**)cpuLongLinkOprod_ex->Gauge_p(), cpuLongLinkOprod_ex->Order(), qudaGaugeParam.cpu_prec, optflag);
  477. #endif
  478. struct timeval ht0, ht1;
  479. gettimeofday(&ht0, NULL);
  480. if (verify_results){
  481. #ifdef MULTI_GPU
  482. hisqStaplesForceCPU(d_act_path_coeff, qudaGaugeParam, *cpuOprod_ex, *cpuGauge_ex, cpuForce_ex);
  483. hisqLongLinkForceCPU(d_act_path_coeff[1], qudaGaugeParam, *cpuLongLinkOprod_ex, *cpuGauge_ex, cpuForce_ex);
  484. hisqCompleteForceCPU(qudaGaugeParam, *cpuForce_ex, *cpuGauge_ex, refMom);
  485. #else
  486. hisqStaplesForceCPU(d_act_path_coeff, qudaGaugeParam, *cpuOprod, *cpuGauge, cpuForce);
  487. hisqLongLinkForceCPU(d_act_path_coeff[1], qudaGaugeParam, *cpuLongLinkOprod, *cpuGauge, cpuForce);
  488. hisqCompleteForceCPU(qudaGaugeParam, *cpuForce, *cpuGauge, refMom);
  489. #endif
  490. }
  491. gettimeofday(&ht1, NULL);
  492. struct timeval t0, t1, t2, t3;
  493. gettimeofday(&t0, NULL);
  494. #ifdef MULTI_GPU
  495. fermion_force::hisqStaplesForceCuda(d_act_path_coeff, qudaGaugeParam, *cudaOprod_ex, *cudaGauge_ex, cudaForce_ex);
  496. cudaDeviceSynchronize();
  497. gettimeofday(&t1, NULL);
  498. delete cudaOprod_ex; //doing this to lower the peak memory usage
  499. gParam_ex.order = QUDA_FLOAT2_GAUGE_ORDER;
  500. cudaLongLinkOprod_ex = new cudaGaugeField(gParam_ex);
  501. loadLinkToGPU_ex(cudaLongLinkOprod_ex, cpuLongLinkOprod_ex);
  502. fermion_force::hisqLongLinkForceCuda(d_act_path_coeff[1], qudaGaugeParam, *cudaLongLinkOprod_ex, *cudaGauge_ex, cudaForce_ex);
  503. cudaDeviceSynchronize();
  504. gettimeofday(&t2, NULL);
  505. #else
  506. fermion_force::hisqStaplesForceCuda(d_act_path_coeff, qudaGaugeParam, *cudaOprod, *cudaGauge, cudaForce);
  507. cudaDeviceSynchronize();
  508. gettimeofday(&t1, NULL);
  509. checkCudaError();
  510. loadLinkToGPU(cudaLongLinkOprod, cpuLongLinkOprod, &qudaGaugeParam);
  511. fermion_force::hisqLongLinkForceCuda(d_act_path_coeff[1], qudaGaugeParam, *cudaLongLinkOprod, *cudaGauge, cudaForce);
  512. cudaDeviceSynchronize();
  513. gettimeofday(&t2, NULL);
  514. #endif
  515. gParam.create = QUDA_NULL_FIELD_CREATE;
  516. gParam.reconstruct = QUDA_RECONSTRUCT_10;
  517. gParam.link_type = QUDA_ASQTAD_MOM_LINKS;
  518. gParam.pad = 0; //X1*X2*X3/2;
  519. gParam.order = QUDA_FLOAT2_GAUGE_ORDER;
  520. cudaMom = new cudaGaugeField(gParam); // Are the elements initialised to zero? - No!
  521. //record the mom pad
  522. qudaGaugeParam.mom_ga_pad = gParam.pad;
  523. #ifdef MULTI_GPU
  524. fermion_force::hisqCompleteForceCuda(qudaGaugeParam, *cudaForce_ex, *cudaGauge_ex, cudaMom);
  525. #else
  526. fermion_force::hisqCompleteForceCuda(qudaGaugeParam, *cudaForce, *cudaGauge, cudaMom);
  527. #endif
  528. cudaDeviceSynchronize();
  529. gettimeofday(&t3, NULL);
  530. checkCudaError();
  531. cudaMom->saveCPUField(*cpuMom, QUDA_CPU_FIELD_LOCATION);
  532. int accuracy_level = 3;
  533. if(verify_results){
  534. int res;
  535. res = compare_floats(cpuMom->Gauge_p(), refMom->Gauge_p(), 4*cpuMom->Volume()*momSiteSize, 1e-5, qudaGaugeParam.cpu_prec);
  536. accuracy_level = strong_check_mom(cpuMom->Gauge_p(), refMom->Gauge_p(), 4*cpuMom->Volume(), qudaGaugeParam.cpu_prec);
  537. printfQuda("Test %s\n",(1 == res) ? "PASSED" : "FAILED");
  538. }
  539. double total_io;
  540. double total_flops;
  541. total_staple_io_flops(link_prec, link_recon, &total_io, &total_flops);
  542. float perf_flops = total_flops / (TDIFF(t0, t1)) *1e-9;
  543. float perf = total_io / (TDIFF(t0, t1)) *1e-9;
  544. printfQuda("Staples time: %.2f ms, perf = %.2f GFLOPS, achieved bandwidth= %.2f GB/s\n", TDIFF(t0,t1)*1000, perf_flops, perf);
  545. printfQuda("Staples time : %g ms\t LongLink time : %g ms\t Completion time : %g ms\n", TDIFF(t0,t1)*1000, TDIFF(t1,t2)*1000, TDIFF(t2,t3)*1000);
  546. printfQuda("Host time (half-wilson fermion force) : %g ms\n", TDIFF(ht0, ht1)*1000);
  547. hisq_force_end();
  548. return accuracy_level;
  549. }
  550. static void
  551. display_test_info()
  552. {
  553. printfQuda("running the following fermion force computation test:\n");
  554. printfQuda("link_precision link_reconstruct space_dim(x/y/z) T_dimension Gauge_order\n");
  555. printfQuda("%s %s %d/%d/%d %d %s\n",
  556. get_prec_str(link_prec),
  557. get_recon_str(link_recon),
  558. xdim, ydim, zdim, tdim,
  559. get_gauge_order_str(gauge_order));
  560. return ;
  561. }
  562. int
  563. main(int argc, char **argv)
  564. {
  565. int i;
  566. for (i =1;i < argc; i++){
  567. if(process_command_line_option(argc, argv, &i) == 0){
  568. continue;
  569. }
  570. if( strcmp(argv[i], "--gauge-order") == 0){
  571. if(i+1 >= argc){
  572. usage(argv);
  573. }
  574. if(strcmp(argv[i+1], "milc") == 0){
  575. gauge_order = QUDA_MILC_GAUGE_ORDER;
  576. }else if(strcmp(argv[i+1], "qdp") == 0){
  577. gauge_order = QUDA_QDP_GAUGE_ORDER;
  578. }else{
  579. fprintf(stderr, "Error: unsupported gauge-field order\n");
  580. exit(1);
  581. }
  582. i++;
  583. continue;
  584. }
  585. fprintf(stderr, "ERROR: Invalid option:%s\n", argv[i]);
  586. usage(argv);
  587. }
  588. #ifdef MULTI_GPU
  589. if(gauge_order == QUDA_MILC_GAUGE_ORDER){
  590. errorQuda("Multi-gpu for milc order is not supported\n");
  591. }
  592. initComms(argc, argv, gridsize_from_cmdline);
  593. #endif
  594. setPrecision(prec);
  595. display_test_info();
  596. int accuracy_level = hisq_force_test();
  597. finalizeComms();
  598. if(accuracy_level >=3 ){
  599. return EXIT_SUCCESS;
  600. }else{
  601. return -1;
  602. }
  603. }