PageRenderTime 44ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/src/sbcast/agent.c

https://github.com/cfenoy/slurm
C | 214 lines | 148 code | 24 blank | 42 comment | 21 complexity | b3ff50e9db99fcfd953134fa939e2f67 MD5 | raw file
Possible License(s): GPL-2.0, AGPL-1.0
  1. /*****************************************************************************\
  2. * agent.c - File transfer agent (handles message traffic)
  3. *****************************************************************************
  4. * Copyright (C) 2006-2007 The Regents of the University of California.
  5. * Copyright (C) 2008-2009 Lawrence Livermore National Security.
  6. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  7. * Written by Morris Jette <jette1@llnl.gov>
  8. * CODE-OCEC-09-009. All rights reserved.
  9. *
  10. * This file is part of SLURM, a resource management program.
  11. * For details, see <http://www.schedmd.com/slurmdocs/>.
  12. * Please also read the included file: DISCLAIMER.
  13. *
  14. * SLURM is free software; you can redistribute it and/or modify it under
  15. * the terms of the GNU General Public License as published by the Free
  16. * Software Foundation; either version 2 of the License, or (at your option)
  17. * any later version.
  18. *
  19. * In addition, as a special exception, the copyright holders give permission
  20. * to link the code of portions of this program with the OpenSSL library under
  21. * certain conditions as described in each individual source file, and
  22. * distribute linked combinations including the two. You must obey the GNU
  23. * General Public License in all respects for all of the code used other than
  24. * OpenSSL. If you modify file(s) with this exception, you may extend this
  25. * exception to your version of the file(s), but you are not obligated to do
  26. * so. If you do not wish to do so, delete this exception statement from your
  27. * version. If you delete this exception statement from all source files in
  28. * the program, then also delete it here.
  29. *
  30. * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
  31. * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  32. * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  33. * details.
  34. *
  35. * You should have received a copy of the GNU General Public License along
  36. * with SLURM; if not, write to the Free Software Foundation, Inc.,
  37. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  38. \*****************************************************************************/
  39. #if HAVE_CONFIG_H
  40. # include "config.h"
  41. #endif
  42. #include <errno.h>
  43. #include <fcntl.h>
  44. #include <pthread.h>
  45. #include <stdio.h>
  46. #include <stdlib.h>
  47. #include <unistd.h>
  48. #include "slurm/slurm_errno.h"
  49. #include "src/common/hostlist.h"
  50. #include "src/common/log.h"
  51. #include "src/common/macros.h"
  52. #include "src/common/read_config.h"
  53. #include "src/common/slurm_protocol_api.h"
  54. #include "src/common/slurm_protocol_defs.h"
  55. #include "src/common/slurm_protocol_interface.h"
  56. #include "src/common/xmalloc.h"
  57. #include "src/common/xstring.h"
  58. #include "src/common/forward.h"
  59. #include "src/sbcast/sbcast.h"
#define MAX_RETRIES 10		/* attempts at pthread_create() before fatal */
#define MAX_THREADS  8		/* These can be huge messages, so
				 * only run MAX_THREADS at one time */

/* Per-thread state for one slice of the broadcast fan-out */
typedef struct thd {
	pthread_t thread;	/* thread ID */
	slurm_msg_t msg;	/* message to send */
	int rc;			/* highest return codes from RPC */
	char *nodelist;		/* ranged host list this thread sends to
				 * (xmalloc'ed, kept across send_rpc calls) */
} thd_t;

/* Count of _agent_thread workers still running; guarded by
 * agent_cnt_mutex, waiters woken via agent_cnt_cond */
static pthread_mutex_t agent_cnt_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  agent_cnt_cond  = PTHREAD_COND_INITIALIZER;
static int agent_cnt = 0;
  72. static void *_agent_thread(void *args);
  73. static void *_agent_thread(void *args)
  74. {
  75. List ret_list = NULL;
  76. thd_t *thread_ptr = (thd_t *) args;
  77. ListIterator itr;
  78. ret_data_info_t *ret_data_info = NULL;
  79. int rc = 0, msg_rc;
  80. ret_list = slurm_send_recv_msgs(thread_ptr->nodelist,
  81. &thread_ptr->msg,
  82. params.timeout, false);
  83. if (ret_list == NULL) {
  84. error("slurm_send_recv_msgs: %m");
  85. exit(1);
  86. }
  87. itr = list_iterator_create(ret_list);
  88. while ((ret_data_info = list_next(itr))) {
  89. msg_rc = slurm_get_return_code(ret_data_info->type,
  90. ret_data_info->data);
  91. if (msg_rc == SLURM_SUCCESS)
  92. continue;
  93. error("REQUEST_FILE_BCAST(%s): %s",
  94. ret_data_info->node_name,
  95. slurm_strerror(msg_rc));
  96. rc = MAX(rc, msg_rc);
  97. }
  98. thread_ptr->rc = rc;
  99. list_iterator_destroy(itr);
  100. if (ret_list)
  101. list_destroy(ret_list);
  102. slurm_mutex_lock(&agent_cnt_mutex);
  103. agent_cnt--;
  104. pthread_cond_broadcast(&agent_cnt_cond);
  105. slurm_mutex_unlock(&agent_cnt_mutex);
  106. return NULL;
  107. }
/* Issue the RPC to transfer the file's data
 * bcast_msg IN - one block of the file being broadcast; attached to every
 *	thread's message each call
 * sbcast_cred IN - credential naming the target nodes (node_list/node_cnt)
 * NOTE: On the first call the node list is partitioned into at most
 *	MAX_THREADS (or params.fanout, if smaller) groups; the per-thread
 *	nodelists and messages are static and reused on later calls for
 *	the remaining blocks of the same file.
 * NOTE: Exits the program (exit(1)) if any node reported an error. */
extern void send_rpc(file_bcast_msg_t *bcast_msg,
		     job_sbcast_cred_msg_t *sbcast_cred)
{
	/* Preserve some data structures across calls for better performance */
	static int threads_used = 0;
	static thd_t thread_info[MAX_THREADS];
	int i, fanout, rc = SLURM_SUCCESS;
	int retries = 0;
	pthread_attr_t attr;

	/* First call only: split the node list into fanout groups */
	if (threads_used == 0) {
		hostlist_t hl;
		hostlist_t new_hl;
		int *span = NULL;
		char *name = NULL;

		if (params.fanout)
			fanout = MIN(MAX_THREADS, params.fanout);
		else
			fanout = MAX_THREADS;
		/* span[k] = extra nodes (beyond the first) in group k */
		span = set_span(sbcast_cred->node_cnt, fanout);

		hl = hostlist_create(sbcast_cred->node_list);
		i = 0;
		while (i < sbcast_cred->node_cnt) {
			int j = 0;
			name = hostlist_shift(hl);
			if(!name) {
				debug3("no more nodes to send to");
				break;
			}
			/* Start a new group with this node, then pull in
			 * span[threads_used] more nodes from the list */
			new_hl = hostlist_create(name);
			free(name);
			i++;
			for(j = 0; j < span[threads_used]; j++) {
				name = hostlist_shift(hl);
				if(!name)
					break;
				hostlist_push(new_hl, name);
				free(name);
				i++;
			}
			/* xmalloc'ed; intentionally kept for later calls */
			thread_info[threads_used].nodelist =
				hostlist_ranged_string_xmalloc(new_hl);
			hostlist_destroy(new_hl);
			slurm_msg_t_init(&thread_info[threads_used].msg);
			thread_info[threads_used].msg.msg_type =
				REQUEST_FILE_BCAST;
			threads_used++;
		}
		xfree(span);
		hostlist_destroy(hl);
		debug("using %d threads", threads_used);
	}

	slurm_attr_init(&attr);
	if (pthread_attr_setstacksize(&attr, 3 * 1024*1024))
		error("pthread_attr_setstacksize: %m");
	/* Detached: completion is tracked via agent_cnt, not pthread_join */
	if (pthread_attr_setdetachstate (&attr,
					 PTHREAD_CREATE_DETACHED))
		error("pthread_attr_setdetachstate error %m");

	for (i=0; i<threads_used; i++) {
		thread_info[i].msg.data = bcast_msg;
		/* Bump agent_cnt before create so the wait below can not
		 * see zero while a thread is still being launched */
		slurm_mutex_lock(&agent_cnt_mutex);
		agent_cnt++;
		slurm_mutex_unlock(&agent_cnt_mutex);
		while (pthread_create(&thread_info[i].thread,
				      &attr, _agent_thread,
				      (void *) &thread_info[i])) {
			error("pthread_create error %m");
			if (++retries > MAX_RETRIES)
				fatal("Can't create pthread");
			sleep(1);	/* sleep and retry */
		}
	}

	/* wait until pthreads complete */
	slurm_mutex_lock(&agent_cnt_mutex);
	while (agent_cnt)
		pthread_cond_wait(&agent_cnt_cond, &agent_cnt_mutex);
	slurm_mutex_unlock(&agent_cnt_mutex);
	pthread_attr_destroy(&attr);

	/* Each rc was written before its thread decremented agent_cnt
	 * under the mutex, so these reads are safe here */
	for (i=0; i<threads_used; i++)
		rc = MAX(rc, thread_info[i].rc);
	if (rc)
		exit(1);
}