PageRenderTime 46ms CodeModel.GetById 14ms RepoModel.GetById 1ms app.codeStats 0ms

/wrfv2_fire/external/RSL_LITE/swap.c

http://github.com/jbeezley/wrf-fire
C | 347 lines | 316 code | 27 blank | 4 comment | 76 complexity | ce1515c2e32833fa273526bcf6b3b9f6 MD5 | raw file
Possible License(s): AGPL-1.0
  1. #ifndef MS_SUA
  2. # include <stdio.h>
  3. #endif
  4. #include <fcntl.h>
  5. #define STANDARD_ERROR 2
  6. #define STANDARD_OUTPUT 1
  7. #ifndef STUBMPI
  8. # include "mpi.h"
  9. #endif
  10. #include "rsl_lite.h"
  11. #define UP_EVEN(A) ((A)+abs((A)%2)) /* A when A is even, A+1 when A is odd (also for negative A) */
  12. #define DOWN_EVEN(A) ((A) - abs((A)%2)) /* A when A is even, A-1 when A is odd */
  13. #define UP_ODD(A) ((A) + abs(((A)+1)%2)) /* A when A is odd, A+1 when A is even */
  14. #define DOWN_ODD(A) ((A) - abs(((A)+1)%2)) /* A when A is odd, A-1 when A is even */
  15. #define MIN(A,B) ((A)<(B)?(A):(B)) /* smaller of A and B; the selected argument is evaluated twice */
  16. #define MAX(A,B) ((A)>(B)?(A):(B)) /* larger of A and B; the selected argument is evaluated twice */
  17. static int *y_curs = NULL ; /* not referenced by any routine visible in this listing */
  18. static int *x_curs = NULL ; /* per-peer byte offset into that peer's message buffer; advanced by RSL_LITE_PACK_SWAP, zeroed by RSL_LITE_INIT_SWAP and RSL_LITE_SWAP */
  19. static int *x_peermask = NULL ; /* x_peermask[P] is 1 when RSL_LITE_INIT_SWAP found that this task exchanges data with rank P, else 0 */
  20. static int *nbytes = NULL ; /* nbytes[P]: buffer size in bytes that RSL_LITE_INIT_SWAP requests for peer P */
  21. #ifndef STUBMPI /* the request tables exist only in real-MPI builds */
  22. static MPI_Request *x_recv = NULL , *x_send = NULL ; /* per-peer handles of the MPI_Irecv / MPI_Isend posted in RSL_LITE_SWAP */
  23. #endif
  24. RSL_LITE_INIT_SWAP (
  25. int * Fcomm ,
  26. int * xy0 ,
  27. int * n3dR0, int *n2dR0, int * typesizeR0 ,
  28. int * n3dI0, int *n2dI0, int * typesizeI0 ,
  29. int * n3dD0, int *n2dD0, int * typesizeD0 ,
  30. int * n3dL0, int *n2dL0, int * typesizeL0 ,
  31. int * me0, int * np0 , int * np_x0 , int * np_y0 ,
  32. int * min_subdomain ,
  33. int * ids0 , int * ide0 , int * jds0 , int * jde0 , int * kds0 , int * kde0 ,
  34. int * ips0 , int * ipe0 , int * jps0 , int * jpe0 , int * kps0 , int * kpe0 )
  35. {
  36. #ifndef STUBMPI
  37. int n3dR, n2dR, typesizeR ;
  38. int n3dI, n2dI, typesizeI ;
  39. int n3dD, n2dD, typesizeD ;
  40. int n3dL, n2dL, typesizeL ;
  41. int xy ;
  42. int me, np, np_x, np_y ;
  43. int ids , ide , jds , jde , kds , kde ;
  44. int ips , ipe , jps , jpe , kps , kpe ;
  45. int ips_send , ipe_send ;
  46. int npts, i, ii, j, m, n, ps, pe, ops, ope ;
  47. int Px, Py, P, coords[2] ;
  48. int ips_swap, ipe_swap ;
  49. MPI_Comm *comm, dummy_comm ;
  50. int ierr ;
  51. comm = &dummy_comm ;
  52. *comm = MPI_Comm_f2c( *Fcomm ) ;
  53. xy = *xy0 ;
  54. n3dR = *n3dR0 ; n2dR = *n2dR0 ; typesizeR = *typesizeR0 ;
  55. n3dI = *n3dI0 ; n2dI = *n2dI0 ; typesizeI = *typesizeI0 ;
  56. n3dD = *n3dD0 ; n2dD = *n2dD0 ; typesizeD = *typesizeD0 ;
  57. n3dL = *n3dL0 ; n2dL = *n2dL0 ; typesizeL = *typesizeL0 ;
  58. me = *me0 ; np = *np0 ; np_x = *np_x0 ; np_y = *np_y0 ;
  59. ids = *ids0-1 ; ide = *ide0-1 ; jds = *jds0-1 ; jde = *jde0-1 ; kds = *kds0-1 ; kde = *kde0-1 ;
  60. ips = *ips0-1 ; ipe = *ipe0-1 ; jps = *jps0-1 ; jpe = *jpe0-1 ; kps = *kps0-1 ; kpe = *kpe0-1 ;
  61. if ( nbytes == NULL ) nbytes = RSL_MALLOC ( int , np ) ;
  62. if ( x_curs == NULL ) x_curs = RSL_MALLOC ( int , np ) ;
  63. if ( x_peermask == NULL ) x_peermask = RSL_MALLOC ( int , np ) ;
  64. if ( x_recv == NULL ) x_recv = RSL_MALLOC ( MPI_Request , np ) ;
  65. if ( x_send == NULL ) x_send = RSL_MALLOC ( MPI_Request , np ) ;
  66. for ( i = 0 ; i < np ; i++ ) { nbytes[i] = 0 ; x_curs[i] = 0 ; x_peermask[i] = 0 ; }
  67. if ( xy == 1 ) { /* xy = 1, swap in X, otherwise Y */
  68. n = (ide-ids+1)/4*2 ;
  69. m = n*2 ;
  70. ps = ips ;
  71. pe = ipe ;
  72. ops = jps ;
  73. ope = jpe ;
  74. } else {
  75. n = (jde-jds+1)/4*2 ;
  76. m = n*2 ;
  77. ps = jps ;
  78. pe = jpe ;
  79. ops = ips ;
  80. ope = ipe ;
  81. }
  82. for ( i = UP_ODD( ps ) ; i <= MIN(pe,m) ; i+=2 ) {
  83. ii = abs(i+n) % m ;
  84. if ( xy == 1 ) {
  85. TASK_FOR_POINT ( &ii , &jps , &ids, &ide , &jds, &jde , &np_x , &np_y , &Px, &Py,
  86. min_subdomain, min_subdomain, &ierr ) ;
  87. coords[1] = Px ; coords[0] = Py ;
  88. MPI_Cart_rank( *comm, coords, &P ) ;
  89. } else {
  90. TASK_FOR_POINT ( &ips , &ii , &ids, &ide , &jds, &jde , &np_x , &np_y , &Px, &Py,
  91. min_subdomain, min_subdomain, &ierr ) ;
  92. coords[1] = Px ; coords[0] = Py ;
  93. MPI_Cart_rank( *comm, coords, &P ) ;
  94. }
  95. nbytes[P] += typesizeR*(ope-ops+1)*(n3dR*(kpe-kps+1)+n2dR) +
  96. typesizeI*(ope-ops+1)*(n3dI*(kpe-kps+1)+n2dI) +
  97. typesizeD*(ope-ops+1)*(n3dD*(kpe-kps+1)+n2dD) +
  98. typesizeL*(ope-ops+1)*(n3dL*(kpe-kps+1)+n2dL) ;
  99. x_peermask[P] = 1 ;
  100. }
  101. for ( P = 0 ; P < np ; P++ ) {
  102. if ( x_peermask[P] ) {
  103. buffer_for_proc ( P , nbytes[P], RSL_RECVBUF ) ;
  104. buffer_for_proc ( P , nbytes[P], RSL_SENDBUF ) ;
  105. }
  106. }
  107. #endif
  108. }
  109. RSL_LITE_PACK_SWAP ( int * Fcomm , char * buf , int * odd0 , int * typesize0 , int * xy0 , int * pu0 , char * memord , int * xstag0 ,
  110. int *me0, int * np0 , int * np_x0 , int * np_y0 ,
  111. int * min_subdomain ,
  112. int * ids0 , int * ide0 , int * jds0 , int * jde0 , int * kds0 , int * kde0 ,
  113. int * ims0 , int * ime0 , int * jms0 , int * jme0 , int * kms0 , int * kme0 ,
  114. int * ips0 , int * ipe0 , int * jps0 , int * jpe0 , int * kps0 , int * kpe0 )
  115. {
  116. #ifndef STUBMPI
  117. int me, np, np_x, np_y ;
  118. int odd , typesize ;
  119. int ids , ide , jds , jde , kds , kde ;
  120. int ims , ime , jms , jme , kms , kme ;
  121. int ips , ipe , jps , jpe , kps , kpe ;
  122. int xstag ; /* 0 not stag, 1 stag */
  123. int xy ; /* y = 0 , x = 1 */
  124. int pu ; /* pack = 0 , unpack = 1 */
  125. int i, ii, j, jj, m, n ;
  126. int ps, pe, ops, ope ;
  127. register int k, t ;
  128. #ifdef crayx1
  129. register int i2,i3,i4,i_offset;
  130. #endif
  131. char *p ;
  132. int da_buf ;
  133. int Px, Py, P, coords[2] ;
  134. int ierr = 0 ;
  135. register int *pi, *qi ;
  136. float f ;
  137. MPI_Comm *comm, dummy_comm ;
  138. comm = &dummy_comm ;
  139. *comm = MPI_Comm_f2c( *Fcomm ) ;
  140. me = *me0 ; np = *np0 ; np_x = *np_x0 ; np_y = *np_y0 ;
  141. xstag = *xstag0 ;
  142. odd = *odd0 ; typesize = *typesize0 ;
  143. ids = *ids0-1 ; ide = *ide0-1 ; jds = *jds0-1 ; jde = *jde0-1 ; kds = *kds0-1 ; kde = *kde0-1 ;
  144. ims = *ims0-1 ; ime = *ime0-1 ; jms = *jms0-1 ; jme = *jme0-1 ; kms = *kms0-1 ; kme = *kme0-1 ;
  145. ips = *ips0-1 ; ipe = *ipe0-1 ; jps = *jps0-1 ; jpe = *jpe0-1 ; kps = *kps0-1 ; kpe = *kpe0-1 ;
  146. xy = *xy0 ;
  147. pu = *pu0 ;
  148. /* need to adapt for other memory orders */
  149. #define RANGE(S1,E1,S2,E2,S3,E3,S4,E4) (((E1)-(S1)+1)*((E2)-(S2)+1)*(((E3)-(S3)+1)/2)*((E4)-(S4)+1))
  150. #define IMAX(A) (((A)>ids)?(A):ids)
  151. #define IMIN(A) (((A)<ide)?(A):ide)
  152. #define JMAX(A) (((A)>jds)?(A):jds)
  153. #define JMIN(A) (((A)<jde)?(A):jde)
  154. da_buf = ( pu == 0 ) ? RSL_SENDBUF : RSL_RECVBUF ;
  155. if ( xy == 1 ) { /* xy = 1, swap in X, otherwise Y */
  156. n = (ide-ids+1)/4*2 ;
  157. m = n*2 ;
  158. } else {
  159. n = (jde-jds+1)/4*2 ;
  160. m = n*2 ;
  161. }
  162. if ( np_x > 1 && xy == 1 ) {
  163. for ( i = UP_ODD(ips) ; i <= MIN(ipe,m) ; i+=2 ) {
  164. ii = abs(i+n) % m ;
  165. TASK_FOR_POINT ( &ii , &jps , &ids, &ide , &jds, &jde , &np_x , &np_y , &Px, &Py,
  166. min_subdomain, min_subdomain, &ierr ) ;
  167. coords[1] = Px ; coords[0] = Py ;
  168. MPI_Cart_rank( *comm, coords, &P ) ;
  169. p = buffer_for_proc( P , 0 , da_buf ) ;
  170. if ( pu == 0 ) {
  171. if ( typesize == sizeof(int) ) {
  172. for ( j = JMAX(jps) ; j <= JMIN(jpe) ; j++ ) {
  173. for ( k = kps ; k <= kpe ; k++ ) {
  174. pi = (int *)(p+x_curs[P]) ;
  175. qi = (int *)((buf + typesize*( (i-ims) + (ime-ims+1)*(
  176. (k-kms) + (j-jms)*(kme-kms+1))))) ;
  177. *pi++ = *qi++ ;
  178. x_curs[P] += typesize ;
  179. }
  180. }
  181. }
  182. else {
  183. for ( j = JMAX(jps) ; j <= JMIN(jpe) ; j++ ) {
  184. for ( k = kps ; k <= kpe ; k++ ) {
  185. for ( t = 0 ; t < typesize ; t++ ) {
  186. *(p+x_curs[P]) =
  187. *(buf + t + typesize*(
  188. (i-ims) + (ime-ims+1)*(
  189. (k-kms) + (j-jms)*(kme-kms+1))) ) ;
  190. x_curs[P]++ ;
  191. }
  192. }
  193. }
  194. }
  195. } else {
  196. if ( typesize == sizeof(int) ) {
  197. for ( j = JMAX(jps) ; j <= JMIN(jpe) ; j++ ) {
  198. for ( k = kps ; k <= kpe ; k++ ) {
  199. pi = (int *)(p+x_curs[P]) ;
  200. qi = (int *)((buf + typesize*( (i-ims) + (ime-ims+1)*(
  201. (k-kms) + (j-jms)*(kme-kms+1))))) ;
  202. *qi++ = *pi++ ;
  203. x_curs[P] += typesize ;
  204. }
  205. }
  206. }
  207. else {
  208. for ( j = JMAX(jps) ; j <= JMIN(jpe) ; j++ ) {
  209. for ( k = kps ; k <= kpe ; k++ ) {
  210. for ( t = 0 ; t < typesize ; t++ ) {
  211. *(buf + t + typesize*(
  212. (i-ims) + (ime-ims+1)*(
  213. (k-kms) + (j-jms)*(kme-kms+1))) ) =
  214. *(p+x_curs[P]) ;
  215. x_curs[P]++ ;
  216. }
  217. }
  218. }
  219. }
  220. }
  221. }
  222. } else if ( np_y > 1 && xy == 0 ) {
  223. for ( j = UP_ODD(jps) ; j <= MIN(jpe,m) ; j+=2 ) {
  224. jj = abs(j+n) % m ;
  225. TASK_FOR_POINT ( &ips , &jj , &ids, &ide , &jds, &jde , &np_x , &np_y , &Px, &Py,
  226. min_subdomain, min_subdomain, &ierr ) ;
  227. coords[1] = Px ; coords[0] = Py ;
  228. MPI_Cart_rank( *comm, coords, &P ) ;
  229. p = buffer_for_proc( P , 0 , da_buf ) ;
  230. if ( pu == 0 ) {
  231. if ( typesize == sizeof(int) ) {
  232. for ( i = IMAX(ips) ; i <= IMIN(ipe) ; i++ ) {
  233. for ( k = kps ; k <= kpe ; k++ ) {
  234. pi = (int *)(p+x_curs[P]) ;
  235. qi = (int *)((buf + typesize*( (i-ims) + (ime-ims+1)*(
  236. (k-kms) + (j-jms)*(kme-kms+1))))) ;
  237. *pi++ = *qi++ ;
  238. x_curs[P] += typesize ;
  239. }
  240. }
  241. }
  242. else {
  243. for ( i = IMAX(ips) ; i <= IMIN(ipe) ; i++ ) {
  244. for ( k = kps ; k <= kpe ; k++ ) {
  245. for ( t = 0 ; t < typesize ; t++ ) {
  246. *(p+x_curs[P]) =
  247. *(buf + t + typesize*(
  248. (i-ims) + (ime-ims+1)*(
  249. (k-kms) + (j-jms)*(kme-kms+1))) ) ;
  250. x_curs[P]++ ;
  251. }
  252. }
  253. }
  254. }
  255. } else {
  256. if ( typesize == sizeof(int) ) {
  257. for ( i = IMAX(ips) ; i <= IMIN(ipe) ; i++ ) {
  258. for ( k = kps ; k <= kpe ; k++ ) {
  259. pi = (int *)(p+x_curs[P]) ;
  260. qi = (int *)((buf + typesize*( (i-ims) + (ime-ims+1)*(
  261. (k-kms) + (j-jms)*(kme-kms+1))))) ;
  262. *qi++ = *pi++ ;
  263. x_curs[P] += typesize ;
  264. }
  265. }
  266. }
  267. else {
  268. for ( i = IMAX(ips) ; i <= IMIN(ipe) ; i++ ) {
  269. for ( k = kps ; k <= kpe ; k++ ) {
  270. for ( t = 0 ; t < typesize ; t++ ) {
  271. *(buf + t + typesize*(
  272. (i-ims) + (ime-ims+1)*(
  273. (k-kms) + (j-jms)*(kme-kms+1))) ) =
  274. *(p+x_curs[P]) ;
  275. x_curs[P]++ ;
  276. }
  277. }
  278. }
  279. }
  280. }
  281. }
  282. }
  283. #endif
  284. }
  285. RSL_LITE_SWAP ( int * Fcomm0, int *me0, int * np0 , int * np_x0 , int * np_y0 )
  286. {
  287. #ifndef STUBMPI
  288. int me, np, np_x, np_y ;
  289. int yp, ym, xp, xm, nb ;
  290. MPI_Status stat ;
  291. MPI_Comm comm, *comm0, dummy_comm ;
  292. int i, P ;
  293. comm0 = &dummy_comm ;
  294. *comm0 = MPI_Comm_f2c( *Fcomm0 ) ;
  295. #if 1
  296. comm = *comm0 ; me = *me0 ; np = *np0 ; np_x = *np_x0 ; np_y = *np_y0 ;
  297. /* fprintf(stderr,"RSL_LITE_SWAP\n") ; */
  298. for ( P = 0 ; P < np ; P++ ) {
  299. if ( x_peermask[P] ) {
  300. nb = buffer_size_for_proc( P, RSL_RECVBUF ) ;
  301. /* fprintf(stderr,"posting irecv from %d, nb = %d\n",P,nb) ; */
  302. MPI_Irecv ( buffer_for_proc( P, x_curs[P], RSL_RECVBUF ), nb, MPI_CHAR, P, me, comm, &(x_recv[P]) ) ;
  303. /* fprintf(stderr,"sending to %d, nb = %d\n",P,x_curs[P]) ; */
  304. MPI_Isend ( buffer_for_proc( P, 0, RSL_SENDBUF ), x_curs[P], MPI_CHAR, P, P, comm, &(x_send[P]) ) ;
  305. }
  306. }
  307. for ( P = 0 ; P < np ; P++ ) {
  308. if ( x_peermask[P] ) {
  309. MPI_Wait( &x_recv[P], &stat ) ;
  310. MPI_Wait( &x_send[P], &stat ) ;
  311. }
  312. }
  313. #else
  314. # ifndef MS_SUA
  315. fprintf(stderr,"RSL_LITE_SWAP disabled\n") ;
  316. # endif
  317. #endif
  318. for ( i = 0 ; i < np ; i++ ) { x_curs[i] = 0 ; }
  319. #endif
  320. }