
/src/slurmctld/agent.c

https://github.com/cfenoy/slurm
C | 1668 lines | 1324 code | 141 blank | 203 comment | 338 complexity | 0bdc7096f57b4059bc9d744bfb33fc6b MD5
Possible License(s): GPL-2.0, AGPL-1.0


  1. /*****************************************************************************\
  2. * agent.c - parallel background communication functions. This is where
  3. * logic could be placed for broadcast communications.
  4. *****************************************************************************
  5. * Copyright (C) 2002-2007 The Regents of the University of California.
  6. * Copyright (C) 2008-2010 Lawrence Livermore National Security.
  7. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  8. * Written by Morris Jette <jette1@llnl.gov>, et. al.
  9. * Derived from pdsh written by Jim Garlick <garlick1@llnl.gov>
  10. * CODE-OCEC-09-009. All rights reserved.
  11. *
  12. * This file is part of SLURM, a resource management program.
  13. * For details, see <http://www.schedmd.com/slurmdocs/>.
  14. * Please also read the included file: DISCLAIMER.
  15. *
  16. * SLURM is free software; you can redistribute it and/or modify it under
  17. * the terms of the GNU General Public License as published by the Free
  18. * Software Foundation; either version 2 of the License, or (at your option)
  19. * any later version.
  20. *
  21. * In addition, as a special exception, the copyright holders give permission
  22. * to link the code of portions of this program with the OpenSSL library under
  23. * certain conditions as described in each individual source file, and
  24. * distribute linked combinations including the two. You must obey the GNU
  25. * General Public License in all respects for all of the code used other than
  26. * OpenSSL. If you modify file(s) with this exception, you may extend this
  27. * exception to your version of the file(s), but you are not obligated to do
  28. * so. If you do not wish to do so, delete this exception statement from your
  29. * version. If you delete this exception statement from all source files in
  30. * the program, then also delete it here.
  31. *
  32. * SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
  33. * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  34. * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  35. * details.
  36. *
  37. * You should have received a copy of the GNU General Public License along
  38. * with SLURM; if not, write to the Free Software Foundation, Inc.,
  39. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  40. *****************************************************************************
  41. * Theory of operation:
  42. *
  43. * The functions below permit slurm to initiate parallel tasks as a
  44. * detached thread and ensure that the work actually happens.
  45. * For example, when a job's time limit is to be changed slurmctld needs
  46. * to notify the slurmd on every node to which the job was allocated.
  47. * We don't want to hang slurmctld's primary function (the job update RPC)
  48. * to perform this work, so it just initiates an agent to perform the work.
  49. * The agent is passed all details required to perform the work, so it will
  50. * be possible to execute the agent as a pthread, process, or even a daemon
  51. * on some other computer.
  52. *
  53. * The main agent thread creates a separate thread for each node to be
  54. * communicated with up to AGENT_THREAD_COUNT. A special watchdog thread
  55. * sends SIGUSR1 to any threads that have been active (in DSH_ACTIVE state)
  56. * for more than COMMAND_TIMEOUT seconds.
  57. * The agent responds to slurmctld via a function call or an RPC as required.
  58. * For example, informing slurmctld that some node is not responding.
  59. *
  60. * All the state for each thread is maintained in thd_t struct, which is
  61. * used by the watchdog thread as well as the communication threads.
  62. \*****************************************************************************/
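/*
 * Minimal usage sketch (hypothetical caller; the message type and host names
 * below are illustrative only): build an agent_arg_t describing the RPC and
 * its target nodes, then hand it to agent_queue_request(). The agent code
 * takes ownership of the structure and xfree's it (including hostlist and
 * msg_args) when the work completes.
 *
 *	agent_arg_t *args = xmalloc(sizeof(agent_arg_t));
 *	args->msg_type   = REQUEST_PING;
 *	args->retry      = 1;
 *	args->hostlist   = hostlist_create("tux[1-2]");
 *	args->node_count = hostlist_count(args->hostlist);
 *	args->msg_args   = NULL;	(REQUEST_PING carries no payload here)
 *	agent_queue_request(args);
 */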
  63. #ifdef HAVE_CONFIG_H
  64. # include "config.h"
  65. #endif
  66. #include <errno.h>
  67. #include <pthread.h>
  68. #include <pwd.h>
  69. #include <signal.h>
  70. #include <string.h>
  71. #include <unistd.h>
  72. #include <sys/types.h>
  73. #include <sys/wait.h>
  74. #include <unistd.h>
  75. #include <stdlib.h>
  76. #include "src/common/forward.h"
  77. #include "src/common/list.h"
  78. #include "src/common/log.h"
  79. #include "src/common/macros.h"
  80. #include "src/common/node_select.h"
  81. #include "src/common/parse_time.h"
  82. #include "src/common/slurm_protocol_api.h"
  83. #include "src/common/slurm_protocol_interface.h"
  84. #include "src/common/uid.h"
  85. #include "src/common/xsignal.h"
  86. #include "src/common/xassert.h"
  87. #include "src/common/xmalloc.h"
  88. #include "src/common/xstring.h"
  89. #include "src/slurmctld/agent.h"
  90. #include "src/slurmctld/job_scheduler.h"
  91. #include "src/slurmctld/locks.h"
  92. #include "src/slurmctld/ping_nodes.h"
  93. #include "src/slurmctld/slurmctld.h"
  94. #include "src/slurmctld/state_save.h"
  95. #include "src/slurmctld/srun_comm.h"
  96. #define MAX_RETRIES 100
  97. typedef enum {
  98. DSH_NEW, /* Request not yet started */
  99. DSH_ACTIVE, /* Request in progress */
  100. DSH_DONE, /* Request completed normally */
  101. DSH_NO_RESP, /* Request timed out */
  102. DSH_FAILED /* Request resulted in error */
  103. } state_t;
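/* Typical per-thread state flow, as driven by _thread_per_group_rpc() and
 * _wdog(): DSH_NEW -> DSH_ACTIVE -> one of DSH_DONE (success),
 * DSH_NO_RESP (timeout, eligible for retry) or DSH_FAILED (RPC error). */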
  104. typedef struct thd_complete {
  105. bool work_done; /* assume all threads complete */
  106. int fail_cnt; /* assume no threads failures */
  107. int no_resp_cnt; /* assume all threads respond */
  108. int retry_cnt; /* assume no required retries */
  109. int max_delay;
  110. time_t now;
  111. } thd_complete_t;
  112. typedef struct thd {
  113. pthread_t thread; /* thread ID */
  114. pthread_attr_t attr; /* thread attributes */
  115. state_t state; /* thread state */
  116. time_t start_time; /* start time */
  117. time_t end_time; /* end time or delta time
  118. * upon termination */
  119. slurm_addr_t *addr; /* specific addr to send to
  120. * will not do nodelist if set */
  121. char *nodelist; /* list of nodes to send to */
  122. List ret_list;
  123. } thd_t;
  124. typedef struct agent_info {
  125. pthread_mutex_t thread_mutex; /* agent specific mutex */
  126. pthread_cond_t thread_cond; /* agent specific condition */
  127. uint32_t thread_count; /* number of thread records */
  128. uint32_t threads_active; /* currently active threads */
  129. uint16_t retry; /* if set, keep trying */
  130. thd_t *thread_struct; /* thread structures */
  131. bool get_reply; /* flag if reply expected */
  132. slurm_msg_type_t msg_type; /* RPC to be issued */
  133. void **msg_args_pptr; /* RPC data to be used */
  134. } agent_info_t;
  135. typedef struct task_info {
  136. pthread_mutex_t *thread_mutex_ptr; /* pointer to agent specific
  137. * mutex */
  138. pthread_cond_t *thread_cond_ptr;/* pointer to agent specific
  139. * condition */
  140. uint32_t *threads_active_ptr; /* currently active thread ptr */
  141. thd_t *thread_struct_ptr; /* thread structures ptr */
  142. bool get_reply; /* flag if reply expected */
  143. slurm_msg_type_t msg_type; /* RPC to be issued */
  144. void *msg_args_ptr; /* ptr to RPC data to be used */
  145. } task_info_t;
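/* One agent_info_t is shared by every thread spawned for a single agent;
 * each worker thread gets its own task_info_t, which simply points back into
 * the shared agent_info_t plus its private thd_t slot (see _make_task_data). */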
  146. typedef struct queued_request {
  147. agent_arg_t* agent_arg_ptr; /* The queued request */
  148. time_t first_attempt; /* Time of first check for batch
  149. * launch RPC *only* */
  150. time_t last_attempt; /* Time of last xmit attempt */
  151. } queued_request_t;
  152. typedef struct mail_info {
  153. char *user_name;
  154. char *message;
  155. } mail_info_t;
  156. static void _sig_handler(int dummy);
  157. static int _batch_launch_defer(queued_request_t *queued_req_ptr);
  158. static inline int _comm_err(char *node_name, slurm_msg_type_t msg_type);
  159. static void _list_delete_retry(void *retry_entry);
  160. static agent_info_t *_make_agent_info(agent_arg_t *agent_arg_ptr);
  161. static task_info_t *_make_task_data(agent_info_t *agent_info_ptr, int inx);
  162. static void _notify_slurmctld_jobs(agent_info_t *agent_ptr);
  163. static void _notify_slurmctld_nodes(agent_info_t *agent_ptr,
  164. int no_resp_cnt, int retry_cnt);
  165. static void _purge_agent_args(agent_arg_t *agent_arg_ptr);
  166. static void _queue_agent_retry(agent_info_t * agent_info_ptr, int count);
  167. static int _setup_requeue(agent_arg_t *agent_arg_ptr, thd_t *thread_ptr,
  168. int count, int *spot);
  169. static void _spawn_retry_agent(agent_arg_t * agent_arg_ptr);
  170. static void *_thread_per_group_rpc(void *args);
  171. static int _valid_agent_arg(agent_arg_t *agent_arg_ptr);
  172. static void *_wdog(void *args);
  173. static mail_info_t *_mail_alloc(void);
  174. static void _mail_free(void *arg);
  175. static void _mail_proc(mail_info_t *mi);
  176. static char *_mail_type_str(uint16_t mail_type);
  177. static pthread_mutex_t retry_mutex = PTHREAD_MUTEX_INITIALIZER;
  178. static pthread_mutex_t mail_mutex = PTHREAD_MUTEX_INITIALIZER;
  179. static List retry_list = NULL; /* agent_arg_t list for retry */
  180. static List mail_list = NULL; /* pending e-mail requests */
  181. static pthread_mutex_t agent_cnt_mutex = PTHREAD_MUTEX_INITIALIZER;
  182. static pthread_cond_t agent_cnt_cond = PTHREAD_COND_INITIALIZER;
  183. static int agent_cnt = 0;
  184. static bool run_scheduler = false;
  185. static bool wiki2_sched = false;
  186. static bool wiki2_sched_test = false;
  187. /*
  188. * agent - party responsible for transmitting a common RPC in parallel
  189. * across a set of nodes. Use agent_queue_request() if immediate
  190. * execution is not essential.
  191. * IN pointer to agent_arg_t, which is xfree'd (including hostlist,
  192. * and msg_args) upon completion
  193. * RET always NULL (function format just for use as pthread)
  194. */
  195. void *agent(void *args)
  196. {
  197. int i, delay, rc, retries = 0;
  198. pthread_attr_t attr_wdog;
  199. pthread_t thread_wdog;
  200. agent_arg_t *agent_arg_ptr = args;
  201. agent_info_t *agent_info_ptr = NULL;
  202. thd_t *thread_ptr;
  203. task_info_t *task_specific_ptr;
  204. time_t begin_time;
  205. #if 0
  206. info("Agent_cnt is %d of %d with msg_type %d",
  207. agent_cnt, MAX_AGENT_CNT, agent_arg_ptr->msg_type);
  208. #endif
  209. slurm_mutex_lock(&agent_cnt_mutex);
  210. if (!wiki2_sched_test) {
  211. char *sched_type = slurm_get_sched_type();
  212. if (strcmp(sched_type, "sched/wiki2") == 0)
  213. wiki2_sched = true;
  214. xfree(sched_type);
  215. wiki2_sched_test = true;
  216. }
  217. while (1) {
  218. if (slurmctld_config.shutdown_time ||
  219. (agent_cnt < MAX_AGENT_CNT)) {
  220. agent_cnt++;
  221. break;
  222. } else { /* wait for state change and retry */
  223. pthread_cond_wait(&agent_cnt_cond, &agent_cnt_mutex);
  224. }
  225. }
  226. slurm_mutex_unlock(&agent_cnt_mutex);
  227. if (slurmctld_config.shutdown_time)
  228. goto cleanup;
  229. /* basic argument value tests */
  230. begin_time = time(NULL);
  231. if (_valid_agent_arg(agent_arg_ptr))
  232. goto cleanup;
  233. /* initialize the agent data structures */
  234. agent_info_ptr = _make_agent_info(agent_arg_ptr);
  235. thread_ptr = agent_info_ptr->thread_struct;
  236. /* start the watchdog thread */
  237. slurm_attr_init(&attr_wdog);
  238. if (pthread_attr_setdetachstate
  239. (&attr_wdog, PTHREAD_CREATE_JOINABLE))
  240. error("pthread_attr_setdetachstate error %m");
  241. while (pthread_create(&thread_wdog, &attr_wdog, _wdog,
  242. (void *) agent_info_ptr)) {
  243. error("pthread_create error %m");
  244. if (++retries > MAX_RETRIES)
  245. fatal("Can't create pthread");
  246. usleep(10000); /* sleep and retry */
  247. }
  248. slurm_attr_destroy(&attr_wdog);
  249. #if AGENT_THREAD_COUNT < 1
  250. fatal("AGENT_THREAD_COUNT value is invalid");
  251. #endif
  252. debug2("got %d threads to send out",agent_info_ptr->thread_count);
  253. /* start all the other threads (up to AGENT_THREAD_COUNT active) */
  254. for (i = 0; i < agent_info_ptr->thread_count; i++) {
  255. /* wait until "room" for another thread */
  256. slurm_mutex_lock(&agent_info_ptr->thread_mutex);
  257. while (agent_info_ptr->threads_active >=
  258. AGENT_THREAD_COUNT) {
  259. pthread_cond_wait(&agent_info_ptr->thread_cond,
  260. &agent_info_ptr->thread_mutex);
  261. }
  262. /* create thread specific data, NOTE: freed from
  263. * _thread_per_group_rpc() */
  264. task_specific_ptr = _make_task_data(agent_info_ptr, i);
  265. slurm_attr_init(&thread_ptr[i].attr);
  266. if (pthread_attr_setdetachstate(&thread_ptr[i].attr,
  267. PTHREAD_CREATE_DETACHED))
  268. error("pthread_attr_setdetachstate error %m");
  269. while ((rc = pthread_create(&thread_ptr[i].thread,
  270. &thread_ptr[i].attr,
  271. _thread_per_group_rpc,
  272. (void *) task_specific_ptr))) {
  273. error("pthread_create error %m");
  274. if (agent_info_ptr->threads_active)
  275. pthread_cond_wait(&agent_info_ptr->
  276. thread_cond,
  277. &agent_info_ptr->
  278. thread_mutex);
  279. else {
  280. slurm_mutex_unlock(&agent_info_ptr->
  281. thread_mutex);
  282. usleep(10000); /* sleep and retry */
  283. slurm_mutex_lock(&agent_info_ptr->
  284. thread_mutex);
  285. }
  286. }
  287. slurm_attr_destroy(&thread_ptr[i].attr);
  288. agent_info_ptr->threads_active++;
  289. slurm_mutex_unlock(&agent_info_ptr->thread_mutex);
  290. }
  291. /* wait for termination of remaining threads */
  292. pthread_join(thread_wdog, NULL);
  293. delay = (int) difftime(time(NULL), begin_time);
  294. if (delay > (slurm_get_msg_timeout() * 2)) {
  295. info("agent msg_type=%u ran for %d seconds",
  296. agent_arg_ptr->msg_type, delay);
  297. }
  298. slurm_mutex_lock(&agent_info_ptr->thread_mutex);
  299. while (agent_info_ptr->threads_active != 0) {
  300. pthread_cond_wait(&agent_info_ptr->thread_cond,
  301. &agent_info_ptr->thread_mutex);
  302. }
  303. slurm_mutex_unlock(&agent_info_ptr->thread_mutex);
  304. cleanup:
  305. _purge_agent_args(agent_arg_ptr);
  306. if (agent_info_ptr) {
  307. xfree(agent_info_ptr->thread_struct);
  308. xfree(agent_info_ptr);
  309. }
  310. slurm_mutex_lock(&agent_cnt_mutex);
  311. if (agent_cnt > 0)
  312. agent_cnt--;
  313. else {
  314. error("agent_cnt underflow");
  315. agent_cnt = 0;
  316. }
  317. if (agent_cnt && agent_cnt < MAX_AGENT_CNT)
  318. agent_retry(RPC_RETRY_INTERVAL, true);
  319. pthread_cond_broadcast(&agent_cnt_cond);
  320. slurm_mutex_unlock(&agent_cnt_mutex);
  321. return NULL;
  322. }
  323. /* Basic validity test of agent argument */
  324. static int _valid_agent_arg(agent_arg_t *agent_arg_ptr)
  325. {
  326. xassert(agent_arg_ptr);
  327. xassert(agent_arg_ptr->hostlist);
  328. if (agent_arg_ptr->node_count == 0)
  329. return SLURM_FAILURE; /* no messages to be sent */
  330. if (agent_arg_ptr->node_count
  331. != hostlist_count(agent_arg_ptr->hostlist)) {
  332. error("you said you were going to send to %d "
  333. "hosts but I only have %d",
  334. agent_arg_ptr->node_count,
  335. hostlist_count(agent_arg_ptr->hostlist));
  336. return SLURM_FAILURE; /* no messages to be sent */
  337. }
  338. return SLURM_SUCCESS;
  339. }
  340. static agent_info_t *_make_agent_info(agent_arg_t *agent_arg_ptr)
  341. {
  342. int i = 0, j = 0;
  343. agent_info_t *agent_info_ptr = NULL;
  344. thd_t *thread_ptr = NULL;
  345. int *span = NULL;
  346. int thr_count = 0;
  347. hostlist_t hl = NULL;
  348. char *name = NULL;
  349. agent_info_ptr = xmalloc(sizeof(agent_info_t));
  350. slurm_mutex_init(&agent_info_ptr->thread_mutex);
  351. if (pthread_cond_init(&agent_info_ptr->thread_cond, NULL))
  352. fatal("pthread_cond_init error %m");
  353. agent_info_ptr->thread_count = agent_arg_ptr->node_count;
  354. agent_info_ptr->retry = agent_arg_ptr->retry;
  355. agent_info_ptr->threads_active = 0;
  356. thread_ptr = xmalloc(agent_info_ptr->thread_count * sizeof(thd_t));
  357. memset(thread_ptr, 0, (agent_info_ptr->thread_count * sizeof(thd_t)));
  358. agent_info_ptr->thread_struct = thread_ptr;
  359. agent_info_ptr->msg_type = agent_arg_ptr->msg_type;
  360. agent_info_ptr->msg_args_pptr = &agent_arg_ptr->msg_args;
  361. if ((agent_arg_ptr->msg_type != REQUEST_JOB_NOTIFY) &&
  362. (agent_arg_ptr->msg_type != REQUEST_REBOOT_NODES) &&
  363. (agent_arg_ptr->msg_type != REQUEST_RECONFIGURE) &&
  364. (agent_arg_ptr->msg_type != REQUEST_SHUTDOWN) &&
  365. (agent_arg_ptr->msg_type != SRUN_EXEC) &&
  366. (agent_arg_ptr->msg_type != SRUN_TIMEOUT) &&
  367. (agent_arg_ptr->msg_type != SRUN_NODE_FAIL) &&
  368. (agent_arg_ptr->msg_type != SRUN_REQUEST_SUSPEND) &&
  369. (agent_arg_ptr->msg_type != SRUN_USER_MSG) &&
  370. (agent_arg_ptr->msg_type != SRUN_STEP_MISSING) &&
  371. (agent_arg_ptr->msg_type != SRUN_STEP_SIGNAL) &&
  372. (agent_arg_ptr->msg_type != SRUN_JOB_COMPLETE)) {
  373. #ifdef HAVE_FRONT_END
  374. span = set_span(agent_arg_ptr->node_count,
  375. agent_arg_ptr->node_count);
  376. #else
  377. /* Sending message to a possibly large number of slurmd.
  378. * Push all message forwarding to slurmd in order to
  379. * offload as much work from slurmctld as possible. */
  380. span = set_span(agent_arg_ptr->node_count, 1);
  381. #endif
  382. agent_info_ptr->get_reply = true;
  383. } else {
  384. /* Message is going to one node (for srun) or we want
  385. * it to get processed ASAP (SHUTDOWN or RECONFIGURE).
  386. * Send the message directly to each node. */
  387. span = set_span(agent_arg_ptr->node_count,
  388. agent_arg_ptr->node_count);
  389. }
  390. i = 0;
  391. while(i < agent_info_ptr->thread_count) {
  392. thread_ptr[thr_count].state = DSH_NEW;
  393. thread_ptr[thr_count].addr = agent_arg_ptr->addr;
  394. name = hostlist_shift(agent_arg_ptr->hostlist);
  395. if(!name) {
  396. debug3("no more nodes to send to");
  397. break;
  398. }
  399. hl = hostlist_create(name);
  400. if(thread_ptr[thr_count].addr && span[thr_count]) {
  401. debug("warning: you will only be sending this to %s",
  402. name);
  403. span[thr_count] = 0;
  404. }
  405. free(name);
  406. i++;
  407. for (j = 0; j < span[thr_count]; j++) {
  408. name = hostlist_shift(agent_arg_ptr->hostlist);
  409. if (!name)
  410. break;
  411. hostlist_push(hl, name);
  412. free(name);
  413. i++;
  414. }
  415. hostlist_uniq(hl);
  416. thread_ptr[thr_count].nodelist =
  417. hostlist_ranged_string_xmalloc(hl);
  418. hostlist_destroy(hl);
  419. #if 0
  420. info("sending msg_type %u to nodes %s",
  421. agent_arg_ptr->msg_type, thread_ptr[thr_count].nodelist);
  422. #endif
  423. thr_count++;
  424. }
  425. xfree(span);
  426. agent_info_ptr->thread_count = thr_count;
  427. return agent_info_ptr;
  428. }
  429. static task_info_t *_make_task_data(agent_info_t *agent_info_ptr, int inx)
  430. {
  431. task_info_t *task_info_ptr;
  432. task_info_ptr = xmalloc(sizeof(task_info_t));
  433. task_info_ptr->thread_mutex_ptr = &agent_info_ptr->thread_mutex;
  434. task_info_ptr->thread_cond_ptr = &agent_info_ptr->thread_cond;
  435. task_info_ptr->threads_active_ptr= &agent_info_ptr->threads_active;
  436. task_info_ptr->thread_struct_ptr = &agent_info_ptr->thread_struct[inx];
  437. task_info_ptr->get_reply = agent_info_ptr->get_reply;
  438. task_info_ptr->msg_type = agent_info_ptr->msg_type;
  439. task_info_ptr->msg_args_ptr = *agent_info_ptr->msg_args_pptr;
  440. return task_info_ptr;
  441. }
  442. static void _update_wdog_state(thd_t *thread_ptr,
  443. state_t *state,
  444. thd_complete_t *thd_comp)
  445. {
  446. switch(*state) {
  447. case DSH_ACTIVE:
  448. thd_comp->work_done = false;
  449. if (thread_ptr->end_time <= thd_comp->now) {
  450. debug3("agent thread %lu timed out",
  451. (unsigned long) thread_ptr->thread);
  452. if (pthread_kill(thread_ptr->thread, SIGUSR1) == ESRCH)
  453. *state = DSH_NO_RESP;
  454. else
  455. thread_ptr->end_time += COMMAND_TIMEOUT;
  456. }
  457. break;
  458. case DSH_NEW:
  459. thd_comp->work_done = false;
  460. break;
  461. case DSH_DONE:
  462. if (thd_comp->max_delay < (int)thread_ptr->end_time)
  463. thd_comp->max_delay = (int)thread_ptr->end_time;
  464. break;
  465. case DSH_NO_RESP:
  466. thd_comp->no_resp_cnt++;
  467. thd_comp->retry_cnt++;
  468. break;
  469. case DSH_FAILED:
  470. thd_comp->fail_cnt++;
  471. break;
  472. }
  473. }
  474. /*
  475. * _wdog - Watchdog thread. Send SIGUSR1 to threads which have been active
  476. * for too long.
  477. * IN args - pointer to agent_info_t with info on threads to watch
  478. * Sleep between polls with exponential times (from 0.005 to 1.0 second)
  479. */
  480. static void *_wdog(void *args)
  481. {
  482. bool srun_agent = false;
  483. int i;
  484. agent_info_t *agent_ptr = (agent_info_t *) args;
  485. thd_t *thread_ptr = agent_ptr->thread_struct;
  486. unsigned long usec = 5000;
  487. ListIterator itr;
  488. thd_complete_t thd_comp;
  489. ret_data_info_t *ret_data_info = NULL;
  490. if ( (agent_ptr->msg_type == SRUN_JOB_COMPLETE) ||
  491. (agent_ptr->msg_type == SRUN_STEP_MISSING) ||
  492. (agent_ptr->msg_type == SRUN_STEP_SIGNAL) ||
  493. (agent_ptr->msg_type == SRUN_EXEC) ||
  494. (agent_ptr->msg_type == SRUN_NODE_FAIL) ||
  495. (agent_ptr->msg_type == SRUN_PING) ||
  496. (agent_ptr->msg_type == SRUN_TIMEOUT) ||
  497. (agent_ptr->msg_type == SRUN_USER_MSG) ||
  498. (agent_ptr->msg_type == RESPONSE_RESOURCE_ALLOCATION) )
  499. srun_agent = true;
  500. thd_comp.max_delay = 0;
  501. while (1) {
  502. thd_comp.work_done = true;/* assume all threads complete */
  503. thd_comp.fail_cnt = 0; /* assume no threads failures */
  504. thd_comp.no_resp_cnt = 0; /* assume all threads respond */
  505. thd_comp.retry_cnt = 0; /* assume no required retries */
  506. thd_comp.now = time(NULL);
  507. usleep(usec);
  508. usec = MIN((usec * 2), 1000000);
  509. slurm_mutex_lock(&agent_ptr->thread_mutex);
  510. for (i = 0; i < agent_ptr->thread_count; i++) {
  511. //info("thread name %s",thread_ptr[i].node_name);
  512. if (!thread_ptr[i].ret_list) {
  513. _update_wdog_state(&thread_ptr[i],
  514. &thread_ptr[i].state,
  515. &thd_comp);
  516. } else {
  517. itr = list_iterator_create(
  518. thread_ptr[i].ret_list);
  519. while((ret_data_info = list_next(itr))) {
  520. _update_wdog_state(&thread_ptr[i],
  521. &ret_data_info->err,
  522. &thd_comp);
  523. }
  524. list_iterator_destroy(itr);
  525. }
  526. }
  527. if (thd_comp.work_done)
  528. break;
  529. slurm_mutex_unlock(&agent_ptr->thread_mutex);
  530. }
  531. if (srun_agent) {
  532. _notify_slurmctld_jobs(agent_ptr);
  533. } else {
  534. _notify_slurmctld_nodes(agent_ptr,
  535. thd_comp.no_resp_cnt,
  536. thd_comp.retry_cnt);
  537. }
  538. for (i = 0; i < agent_ptr->thread_count; i++) {
  539. if (thread_ptr[i].ret_list)
  540. list_destroy(thread_ptr[i].ret_list);
  541. xfree(thread_ptr[i].nodelist);
  542. }
  543. if (thd_comp.max_delay)
  544. debug2("agent maximum delay %d seconds", thd_comp.max_delay);
  545. slurm_mutex_unlock(&agent_ptr->thread_mutex);
  546. return (void *) NULL;
  547. }
  548. static void _notify_slurmctld_jobs(agent_info_t *agent_ptr)
  549. {
  550. /* Locks: Write job */
  551. slurmctld_lock_t job_write_lock =
  552. { NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK };
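/* (slurmctld_lock_t fields are ordered: config, job, node, partition) */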
  553. uint32_t job_id = 0, step_id = 0;
  554. thd_t *thread_ptr = agent_ptr->thread_struct;
  555. if (agent_ptr->msg_type == SRUN_PING) {
  556. srun_ping_msg_t *msg = *agent_ptr->msg_args_pptr;
  557. job_id = msg->job_id;
  558. step_id = msg->step_id;
  559. } else if (agent_ptr->msg_type == SRUN_TIMEOUT) {
  560. srun_timeout_msg_t *msg = *agent_ptr->msg_args_pptr;
  561. job_id = msg->job_id;
  562. step_id = msg->step_id;
  563. } else if (agent_ptr->msg_type == RESPONSE_RESOURCE_ALLOCATION) {
  564. resource_allocation_response_msg_t *msg =
  565. *agent_ptr->msg_args_pptr;
  566. job_id = msg->job_id;
  567. step_id = NO_VAL;
  568. } else if ((agent_ptr->msg_type == SRUN_JOB_COMPLETE) ||
  569. (agent_ptr->msg_type == SRUN_STEP_MISSING) ||
  570. (agent_ptr->msg_type == SRUN_STEP_SIGNAL) ||
  571. (agent_ptr->msg_type == SRUN_EXEC) ||
  572. (agent_ptr->msg_type == SRUN_USER_MSG)) {
  573. return; /* no need to note srun response */
  574. } else if (agent_ptr->msg_type == SRUN_NODE_FAIL) {
  575. return; /* no need to note srun response */
  576. } else {
  577. error("_notify_slurmctld_jobs invalid msg_type %u",
  578. agent_ptr->msg_type);
  579. return;
  580. }
  581. lock_slurmctld(job_write_lock);
  582. if (thread_ptr[0].state == DSH_DONE) {
  583. srun_response(job_id, step_id);
  584. }
  585. unlock_slurmctld(job_write_lock);
  586. }
  587. static void _notify_slurmctld_nodes(agent_info_t *agent_ptr,
  588. int no_resp_cnt, int retry_cnt)
  589. {
  590. ListIterator itr = NULL;
  591. ret_data_info_t *ret_data_info = NULL;
  592. state_t state;
  593. int is_ret_list = 1;
  594. /* Locks: Read config, write job, write node */
  595. slurmctld_lock_t node_write_lock =
  596. { READ_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK };
  597. thd_t *thread_ptr = agent_ptr->thread_struct;
  598. int i;
  599. /* Notify slurmctld of non-responding nodes */
  600. if (no_resp_cnt) {
  601. /* Update node table data for non-responding nodes */
  602. lock_slurmctld(node_write_lock);
  603. if (agent_ptr->msg_type == REQUEST_BATCH_JOB_LAUNCH) {
  604. /* Requeue the request */
  605. batch_job_launch_msg_t *launch_msg_ptr =
  606. *agent_ptr->msg_args_pptr;
  607. uint32_t job_id = launch_msg_ptr->job_id;
  608. job_complete(job_id, 0, true, false, 0);
  609. }
  610. unlock_slurmctld(node_write_lock);
  611. }
  612. if (retry_cnt && agent_ptr->retry)
  613. _queue_agent_retry(agent_ptr, retry_cnt);
  614. /* Update last_response on responding nodes */
  615. lock_slurmctld(node_write_lock);
  616. for (i = 0; i < agent_ptr->thread_count; i++) {
  617. char *down_msg, *node_names;
  618. if (!thread_ptr[i].ret_list) {
  619. state = thread_ptr[i].state;
  620. is_ret_list = 0;
  621. goto switch_on_state;
  622. }
  623. is_ret_list = 1;
  624. itr = list_iterator_create(thread_ptr[i].ret_list);
  625. while ((ret_data_info = list_next(itr))) {
  626. state = ret_data_info->err;
  627. switch_on_state:
  628. switch(state) {
  629. case DSH_NO_RESP:
  630. if (!is_ret_list) {
  631. node_not_resp(thread_ptr[i].nodelist,
  632. thread_ptr[i].
  633. start_time);
  634. } else {
  635. node_not_resp(ret_data_info->node_name,
  636. thread_ptr[i].start_time);
  637. }
  638. break;
  639. case DSH_FAILED:
  640. if (is_ret_list)
  641. node_names = ret_data_info->node_name;
  642. else
  643. node_names = thread_ptr[i].nodelist;
  644. #ifdef HAVE_FRONT_END
  645. down_msg = "";
  646. #else
  647. set_node_down(node_names,
  648. "Prolog/Epilog failure");
  649. down_msg = ", set to state DOWN";
  650. #endif
  651. error("Prolog/Epilog failure on nodes %s%s",
  652. node_names, down_msg);
  653. break;
  654. case DSH_DONE:
  655. if (!is_ret_list)
  656. node_did_resp(thread_ptr[i].nodelist);
  657. else
  658. node_did_resp(ret_data_info->node_name);
  659. break;
  660. default:
  661. if (!is_ret_list) {
  662. error("unknown state returned for %s",
  663. thread_ptr[i].nodelist);
  664. } else {
  665. error("unknown state returned for %s",
  666. ret_data_info->node_name);
  667. }
  668. break;
  669. }
  670. if (!is_ret_list)
  671. goto finished;
  672. }
  673. list_iterator_destroy(itr);
  674. finished: ;
  675. }
  676. unlock_slurmctld(node_write_lock);
  677. if (run_scheduler) {
  678. run_scheduler = false;
  679. /* below functions all have their own locking */
  680. if (schedule(0)) {
  681. schedule_job_save();
  682. schedule_node_save();
  683. }
  684. }
  685. if ((agent_ptr->msg_type == REQUEST_PING) ||
  686. (agent_ptr->msg_type == REQUEST_HEALTH_CHECK) ||
  687. (agent_ptr->msg_type == REQUEST_ACCT_GATHER_UPDATE) ||
  688. (agent_ptr->msg_type == REQUEST_NODE_REGISTRATION_STATUS))
  689. ping_end();
  690. }
  691. /* Report a communications error for specified node
  692. * This also gets logged as a non-responsive node */
  693. static inline int _comm_err(char *node_name, slurm_msg_type_t msg_type)
  694. {
  695. int rc = 1;
  696. if ((rc = is_node_resp (node_name)))
  697. verbose("agent/is_node_resp: node:%s rpc:%d : %m",
  698. node_name, msg_type);
  699. return rc;
  700. }
  701. /* return a value for which WEXITSTATUS returns 1 */
  702. static int _wif_status(void)
  703. {
  704. static int rc = 0;
  705. int i;
  706. if (rc)
  707. return rc;
  708. rc = 1;
  709. for (i=0; i<64; i++) {
  710. if (WEXITSTATUS(rc))
  711. return rc;
  712. rc = rc << 1;
  713. }
  714. error("Could not identify WEXITSTATUS");
  715. rc = 1;
  716. return rc;
  717. }
  718. /*
  719. * _thread_per_group_rpc - thread to issue an RPC for a group of nodes
  720. * sending message out to one and forwarding it to
  721. * others if necessary.
  722. * IN/OUT args - pointer to task_info_t, xfree'd on completion
  723. */
  724. static void *_thread_per_group_rpc(void *args)
  725. {
  726. int rc = SLURM_SUCCESS;
  727. slurm_msg_t msg;
  728. task_info_t *task_ptr = (task_info_t *) args;
  729. /* we cache some pointers from task_info_t because we need
  730. * to xfree args before being finished with their use. xfree
  731. * is required for timely termination of this pthread because
  732. * xfree could lock it at the end, preventing a timely
  733. * thread_exit */
  734. pthread_mutex_t *thread_mutex_ptr = task_ptr->thread_mutex_ptr;
  735. pthread_cond_t *thread_cond_ptr = task_ptr->thread_cond_ptr;
  736. uint32_t *threads_active_ptr = task_ptr->threads_active_ptr;
  737. thd_t *thread_ptr = task_ptr->thread_struct_ptr;
  738. state_t thread_state = DSH_NO_RESP;
  739. slurm_msg_type_t msg_type = task_ptr->msg_type;
  740. bool is_kill_msg, srun_agent;
  741. List ret_list = NULL;
  742. ListIterator itr;
  743. ret_data_info_t *ret_data_info = NULL;
  744. int sig_array[2] = {SIGUSR1, 0};
  745. /* Locks: Write job, write node */
  746. slurmctld_lock_t job_write_lock = {
  747. NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK };
  748. /* Lock: Read node */
  749. slurmctld_lock_t node_read_lock = {
  750. NO_LOCK, NO_LOCK, READ_LOCK, NO_LOCK };
  751. /* Lock: Write node */
  752. slurmctld_lock_t node_write_lock = {
  753. NO_LOCK, NO_LOCK, WRITE_LOCK, NO_LOCK };
  754. xassert(args != NULL);
  755. xsignal(SIGUSR1, _sig_handler);
  756. xsignal_unblock(sig_array);
  757. is_kill_msg = ( (msg_type == REQUEST_KILL_TIMELIMIT) ||
  758. (msg_type == REQUEST_KILL_PREEMPTED) ||
  759. (msg_type == REQUEST_TERMINATE_JOB) );
  760. srun_agent = ( (msg_type == SRUN_PING) ||
  761. (msg_type == SRUN_EXEC) ||
  762. (msg_type == SRUN_JOB_COMPLETE) ||
  763. (msg_type == SRUN_STEP_MISSING) ||
  764. (msg_type == SRUN_STEP_SIGNAL) ||
  765. (msg_type == SRUN_TIMEOUT) ||
  766. (msg_type == SRUN_USER_MSG) ||
  767. (msg_type == RESPONSE_RESOURCE_ALLOCATION) ||
  768. (msg_type == SRUN_NODE_FAIL) );
  769. thread_ptr->start_time = time(NULL);
  770. slurm_mutex_lock(thread_mutex_ptr);
  771. thread_ptr->state = DSH_ACTIVE;
  772. thread_ptr->end_time = thread_ptr->start_time + COMMAND_TIMEOUT;
  773. slurm_mutex_unlock(thread_mutex_ptr);
  774. /* send request message */
  775. slurm_msg_t_init(&msg);
  776. msg.msg_type = msg_type;
  777. msg.data = task_ptr->msg_args_ptr;
  778. #if 0
  779. info("sending message type %u to %s", msg_type, thread_ptr->nodelist);
  780. #endif
  781. if (task_ptr->get_reply) {
  782. if (thread_ptr->addr) {
  783. msg.address = *thread_ptr->addr;
  784. if (!(ret_list = slurm_send_addr_recv_msgs(
  785. &msg, thread_ptr->nodelist, 0))) {
  786. error("_thread_per_group_rpc: "
  787. "no ret_list given");
  788. goto cleanup;
  789. }
  790. } else {
  791. if (!(ret_list = slurm_send_recv_msgs(
  792. thread_ptr->nodelist,
  793. &msg, 0, true))) {
  794. error("_thread_per_group_rpc: "
  795. "no ret_list given");
  796. goto cleanup;
  797. }
  798. }
  799. } else {
  800. if (thread_ptr->addr) {
  801. //info("got the address");
  802. msg.address = *thread_ptr->addr;
  803. } else {
  804. //info("no address given");
  805. if(slurm_conf_get_addr(thread_ptr->nodelist,
  806. &msg.address) == SLURM_ERROR) {
  807. error("_thread_per_group_rpc: "
  808. "can't find address for host %s, "
  809. "check slurm.conf",
  810. thread_ptr->nodelist);
  811. goto cleanup;
  812. }
  813. }
  814. //info("sending %u to %s", msg_type, thread_ptr->nodelist);
  815. if (slurm_send_only_node_msg(&msg) == SLURM_SUCCESS) {
  816. thread_state = DSH_DONE;
  817. } else {
  818. if (!srun_agent) {
  819. lock_slurmctld(node_read_lock);
  820. _comm_err(thread_ptr->nodelist, msg_type);
  821. unlock_slurmctld(node_read_lock);
  822. }
  823. }
  824. goto cleanup;
  825. }
  826. //info("got %d messages back", list_count(ret_list));
  827. itr = list_iterator_create(ret_list);
  828. while ((ret_data_info = list_next(itr)) != NULL) {
  829. rc = slurm_get_return_code(ret_data_info->type,
  830. ret_data_info->data);
  831. /* SPECIAL CASE: Record node's CPU load */
  832. if (ret_data_info->type == RESPONSE_PING_SLURMD) {
  833. ping_slurmd_resp_msg_t *ping_resp;
  834. ping_resp = (ping_slurmd_resp_msg_t *)
  835. ret_data_info->data;
  836. lock_slurmctld(node_write_lock);
  837. reset_node_load(ret_data_info->node_name,
  838. ping_resp->cpu_load);
  839. unlock_slurmctld(node_write_lock);
  840. }
  841. /* SPECIAL CASE: Mark node as IDLE if job already complete */
  842. if (is_kill_msg &&
  843. (rc == ESLURMD_KILL_JOB_ALREADY_COMPLETE)) {
  844. kill_job_msg_t *kill_job;
  845. kill_job = (kill_job_msg_t *)
  846. task_ptr->msg_args_ptr;
  847. rc = SLURM_SUCCESS;
  848. lock_slurmctld(job_write_lock);
  849. if (job_epilog_complete(kill_job->job_id,
  850. ret_data_info->
  851. node_name,
  852. rc))
  853. run_scheduler = true;
  854. unlock_slurmctld(job_write_lock);
  855. }
  856. /* SPECIAL CASE: Record node's CPU load */
  857. if (ret_data_info->type == RESPONSE_ACCT_GATHER_UPDATE) {
  858. lock_slurmctld(node_write_lock);
  859. update_node_record_acct_gather_data(
  860. ret_data_info->data);
  861. unlock_slurmctld(node_write_lock);
  862. }
  863. /* SPECIAL CASE: Kill non-startable batch job,
  864. * Requeue the job on ESLURMD_PROLOG_FAILED */
  865. if ((msg_type == REQUEST_BATCH_JOB_LAUNCH) &&
  866. (rc != SLURM_SUCCESS) && (rc != ESLURMD_PROLOG_FAILED) &&
  867. (ret_data_info->type != RESPONSE_FORWARD_FAILED)) {
  868. batch_job_launch_msg_t *launch_msg_ptr =
  869. task_ptr->msg_args_ptr;
  870. uint32_t job_id = launch_msg_ptr->job_id;
  871. info("Killing non-startable batch job %u: %s",
  872. job_id, slurm_strerror(rc));
  873. thread_state = DSH_DONE;
  874. ret_data_info->err = thread_state;
  875. lock_slurmctld(job_write_lock);
  876. job_complete(job_id, 0, false, false, _wif_status());
  877. unlock_slurmctld(job_write_lock);
  878. continue;
  879. }
  880. if (((msg_type == REQUEST_SIGNAL_TASKS) ||
  881. (msg_type == REQUEST_TERMINATE_TASKS)) &&
  882. (rc == ESRCH)) {
  883. /* process is already dead, not a real error */
  884. rc = SLURM_SUCCESS;
  885. }
  886. switch (rc) {
  887. case SLURM_SUCCESS:
  888. /* debug("agent processed RPC to node %s", */
  889. /* ret_data_info->node_name); */
  890. thread_state = DSH_DONE;
  891. break;
  892. case SLURM_UNKNOWN_FORWARD_ADDR:
  893. error("We were unable to forward message to '%s'. "
  894. "Make sure the slurm.conf for each slurmd "
  895. "contain all other nodes in your system.",
  896. ret_data_info->node_name);
  897. thread_state = DSH_NO_RESP;
  898. break;
  899. case ESLURMD_EPILOG_FAILED:
  900. error("Epilog failure on host %s, "
  901. "setting DOWN",
  902. ret_data_info->node_name);
  903. thread_state = DSH_FAILED;
  904. break;
  905. case ESLURMD_PROLOG_FAILED:
  906. thread_state = DSH_FAILED;
  907. break;
  908. case ESLURM_INVALID_JOB_ID:
  909. /* Not indicative of a real error */
  910. case ESLURMD_JOB_NOTRUNNING:
  911. /* Not indicative of a real error */
  912. debug2("agent processed RPC to node %s: %s",
  913. ret_data_info->node_name,
  914. slurm_strerror(rc));
  915. thread_state = DSH_DONE;
  916. break;
  917. default:
  918. if (!srun_agent) {
  919. if (ret_data_info->err)
  920. errno = ret_data_info->err;
  921. else
  922. errno = rc;
  923. lock_slurmctld(node_read_lock);
  924. rc = _comm_err(ret_data_info->node_name,
  925. msg_type);
  926. unlock_slurmctld(node_read_lock);
  927. }
  928. if (srun_agent)
  929. thread_state = DSH_FAILED;
  930. else if(ret_data_info->type == RESPONSE_FORWARD_FAILED)
  931. /* check if a forward failed */
  932. thread_state = DSH_NO_RESP;
  933. else { /* some will fail that don't mean anything went
  934. * bad like a job term request on a job that is
  935. * already finished, we will just exit on those
  936. * cases */
  937. thread_state = DSH_DONE;
  938. }
  939. }
  940. ret_data_info->err = thread_state;
  941. }
  942. list_iterator_destroy(itr);
  943. cleanup:
  944. xfree(args);
  945. /* handled at end of thread just in case resend is needed */
  946. destroy_forward(&msg.forward);
  947. slurm_mutex_lock(thread_mutex_ptr);
  948. thread_ptr->ret_list = ret_list;
  949. thread_ptr->state = thread_state;
  950. thread_ptr->end_time = (time_t) difftime(time(NULL),
  951. thread_ptr->start_time);
  952. /* Signal completion so another thread can replace us */
  953. (*threads_active_ptr)--;
  954. pthread_cond_signal(thread_cond_ptr);
  955. slurm_mutex_unlock(thread_mutex_ptr);
  956. return (void *) NULL;
  957. }
  958. /*
  959. * Signal handler. We are really interested in interrupting hung communications
  960. * and causing them to return EINTR. Multiple interrupts might be required.
  961. */
  962. static void _sig_handler(int dummy)
  963. {
  964. }
  965. static int _setup_requeue(agent_arg_t *agent_arg_ptr, thd_t *thread_ptr,
  966. int count, int *spot)
  967. {
  968. ret_data_info_t *ret_data_info = NULL;
  969. ListIterator itr = list_iterator_create(thread_ptr->ret_list);
  970. while((ret_data_info = list_next(itr))) {
  971. debug2("got err of %d", ret_data_info->err);
  972. if (ret_data_info->err != DSH_NO_RESP)
  973. continue;
  974. debug("got the name %s to resend out of %d",
  975. ret_data_info->node_name, count);
  976. if(agent_arg_ptr) {
  977. hostlist_push(agent_arg_ptr->hostlist,
  978. ret_data_info->node_name);
  979. if ((++(*spot)) == count) {
  980. list_iterator_destroy(itr);
  981. return 1;
  982. }
  983. }
  984. }
  985. list_iterator_destroy(itr);
  986. return 0;
  987. }
  988. /*
  989. * _queue_agent_retry - Queue any failed RPCs for later replay
  990. * IN agent_info_ptr - pointer to info on completed agent requests
  991. * IN count - number of agent requests which failed, count to requeue
  992. */
  993. static void _queue_agent_retry(agent_info_t * agent_info_ptr, int count)
  994. {
  995. agent_arg_t *agent_arg_ptr;
  996. queued_request_t *queued_req_ptr = NULL;
  997. thd_t *thread_ptr = agent_info_ptr->thread_struct;
  998. int i, j;
  999. if (count == 0)
  1000. return;
  1001. /* build agent argument with just the RPCs to retry */
  1002. agent_arg_ptr = xmalloc(sizeof(agent_arg_t));
  1003. agent_arg_ptr->node_count = count;
  1004. agent_arg_ptr->retry = 1;
  1005. agent_arg_ptr->hostlist = hostlist_create("");
  1006. agent_arg_ptr->msg_type = agent_info_ptr->msg_type;
  1007. agent_arg_ptr->msg_args = *(agent_info_ptr->msg_args_pptr);
  1008. *(agent_info_ptr->msg_args_pptr) = NULL;
  1009. j = 0;
  1010. for (i = 0; i < agent_info_ptr->thread_count; i++) {
  1011. if(!thread_ptr[i].ret_list) {
  1012. if (thread_ptr[i].state != DSH_NO_RESP)
  1013. continue;
  1014. debug("got the name %s to resend",
  1015. thread_ptr[i].nodelist);
  1016. hostlist_push(agent_arg_ptr->hostlist,
  1017. thread_ptr[i].nodelist);
  1018. if ((++j) == count)
  1019. break;
  1020. } else {
  1021. if(_setup_requeue(agent_arg_ptr, &thread_ptr[i],
  1022. count, &j))
  1023. break;
  1024. }
  1025. }
  1026. if (count != j) {
  1027. error("agent: Retry count (%d) != actual count (%d)",
  1028. count, j);
  1029. agent_arg_ptr->node_count = j;
  1030. }
  1031. debug2("Queue RPC msg_type=%u, nodes=%d for retry",
  1032. agent_arg_ptr->msg_type, j);
  1033. /* add the request to a list */
  1034. queued_req_ptr = xmalloc(sizeof(queued_request_t));
  1035. queued_req_ptr->agent_arg_ptr = agent_arg_ptr;
  1036. queued_req_ptr->last_attempt = time(NULL);
  1037. slurm_mutex_lock(&retry_mutex);
  1038. if (retry_list == NULL) {
  1039. retry_list = list_create(_list_delete_retry);
  1040. if (retry_list == NULL)
  1041. fatal("list_create failed");
  1042. }
  1043. if (list_append(retry_list, (void *) queued_req_ptr) == 0)
  1044. fatal("list_append failed");
  1045. slurm_mutex_unlock(&retry_mutex);
  1046. }
  1047. /*
  1048. * _list_delete_retry - delete an entry from the retry list,
  1049. * see common/list.h for documentation
  1050. */
  1051. static void _list_delete_retry(void *retry_entry)
  1052. {
  1053. queued_request_t *queued_req_ptr;
  1054. if (! retry_entry)
  1055. return;
  1056. queued_req_ptr = (queued_request_t *) retry_entry;
  1057. _purge_agent_args(queued_req_ptr->agent_arg_ptr);
  1058. xfree(queued_req_ptr);
  1059. }
  1060. /*
  1061. * agent_retry - Agent for retrying pending RPCs. One pending request is
  1062. * issued if it has been pending for at least min_wait seconds
  1063. * IN min_wait - Minimum wait time between re-issue of a pending RPC
  1064. * IN mail_too - Send pending email too; note this is performed using a
  1065. * fork/waitpid, so it can take longer than just creating a pthread
  1066. * to send RPCs
  1067. * RET count of queued requests remaining
  1068. */
  1069. extern int agent_retry (int min_wait, bool mail_too)
  1070. {
  1071. int list_size = 0, rc;
  1072. time_t now = time(NULL);
  1073. queued_request_t *queued_req_ptr = NULL;
  1074. agent_arg_t *agent_arg_ptr = NULL;
  1075. ListIterator retry_iter;
  1076. slurm_mutex_lock(&retry_mutex);
  1077. if (retry_list) {
  1078. static time_t last_msg_time = (time_t) 0;
  1079. uint32_t msg_type[5], i = 0;
  1080. list_size = list_count(retry_list);
  1081. if ((list_size > MAX_AGENT_CNT)
  1082. && (difftime(now, last_msg_time) > 300)) {
  1083. /* Note sizable backlog of work */
  1084. info("WARNING: agent retry_list size is %d",
  1085. list_size);
  1086. retry_iter = list_iterator_create(retry_list);
  1087. while ((queued_req_ptr = (queued_request_t *)
  1088. list_next(retry_iter))) {
  1089. agent_arg_ptr = queued_req_ptr->agent_arg_ptr;
  1090. msg_type[i++] = agent_arg_ptr->msg_type;
  1091. if (i == 5)
  1092. break;
  1093. }
  1094. list_iterator_destroy(retry_iter);
  1095. info(" retry_list msg_type=%u,%u,%u,%u,%u",
  1096. msg_type[0], msg_type[1], msg_type[2],
  1097. msg_type[3], msg_type[4]);
  1098. last_msg_time = now;
  1099. }
  1100. }
  1101. if (agent_cnt >= MAX_AGENT_CNT) { /* too much work already */
  1102. slurm_mutex_unlock(&retry_mutex);
  1103. return list_size;
  1104. }
  1105. if (retry_list) {
  1106. /* first try to find a new (never tried) record */
  1107. retry_iter = list_iterator_create(retry_list);
  1108. while ((queued_req_ptr = (queued_request_t *)
  1109. list_next(retry_iter))) {
  1110. rc = _batch_launch_defer(queued_req_ptr);
  1111. if (rc == -1) { /* abort request */
  1112. _purge_agent_args(queued_req_ptr->
  1113. agent_arg_ptr);
  1114. xfree(queued_req_ptr);
  1115. list_remove(retry_iter);
  1116. list_size--;
  1117. continue;
  1118. }
  1119. if (rc > 0)
  1120. continue;
  1121. if (queued_req_ptr->last_attempt == 0) {
  1122. list_remove(retry_iter);
  1123. list_size--;
  1124. break;
  1125. }
  1126. }
  1127. list_iterator_destroy(retry_iter);
  1128. }
  1129. if (retry_list && (queued_req_ptr == NULL)) {
  1130. /* now try to find a requeue request that is
  1131. * relatively old */
  1132. double age = 0;
  1133. retry_iter = list_iterator_create(retry_list);
  1134. /* next try to find an older record to retry */
  1135. while ((queued_req_ptr = (queued_request_t *)
  1136. list_next(retry_iter))) {
  1137. rc = _batch_launch_defer(queued_req_ptr);
  1138. if (rc == -1) { /* abort request */
  1139. _purge_agent_args(queued_req_ptr->
  1140. agent_arg_ptr);
  1141. xfree(queued_req_ptr);
  1142. list_remove(retry_iter);
  1143. list_size--;
  1144. continue;
  1145. }
  1146. if (rc > 0)
  1147. continue;
  1148. age = difftime(now, queued_req_ptr->last_attempt);
  1149. if (age > min_wait) {
  1150. list_remove(retry_iter);
  1151. list_size--;
  1152. break;
  1153. }
  1154. }
  1155. list_iterator_destroy(retry_iter);
  1156. }
  1157. slurm_mutex_unlock(&retry_mutex);
  1158. if (queued_req_ptr) {
  1159. agent_arg_ptr = queued_req_ptr->agent_arg_ptr;
  1160. xfree(queued_req_ptr);
  1161. if (agent_arg_ptr) {
  1162. _spawn_retry_agent(agent_arg_ptr);
  1163. } else
  1164. error("agent_retry found record with no agent_args");
  1165. } else if (mail_too) {
  1166. mail_info_t *mi = NULL;
  1167. slurm_mutex_lock(&mail_mutex);
  1168. if (mail_list)
  1169. mi = (mail_info_t *) list_dequeue(mail_list);
  1170. slurm_mutex_unlock(&mail_mutex);
  1171. if (mi)
  1172. _mail_proc(mi);
  1173. }
  1174. return list_size;
  1175. }
  1176. /*
  1177. * agent_queue_request - put a new request on the queue for execution or
  1178. * execute now if not too busy
  1179. * IN agent_arg_ptr - the request to enqueue
  1180. */
  1181. void agent_queue_request(agent_arg_t *agent_arg_ptr)
  1182. {
  1183. queued_request_t *queued_req_ptr = NULL;
  1184. if (agent_arg_ptr->msg_type == REQUEST_SHUTDOWN) {
  1185. /* execute now */
  1186. pthread_attr_t attr_agent;
  1187. pthread_t thread_agent;
  1188. int rc;
  1189. slurm_attr_init(&attr_agent);
  1190. if (pthread_attr_setdetachstate
  1191. (&attr_agent, PTHREAD_CREATE_DETACHED))
  1192. error("pthread_attr_setdetachstate error %m");
  1193. rc = pthread_create(&thread_agent, &attr_agent,
  1194. agent, (void *) agent_arg_ptr);
  1195. slurm_attr_destroy(&attr_agent);
  1196. if (rc == 0) {
  1197. usleep(10000); /* give agent a chance to start */
  1198. return;
  1199. }
  1200. }
  1201. queued_req_ptr = xmalloc(sizeof(queued_request_t));
  1202. queued_req_ptr->agent_arg_ptr = agent_arg_ptr;
  1203. /* queued_req_ptr->last_attempt = 0; Implicit */
  1204. slurm_mutex_lock(&retry_mutex);
  1205. if (retry_list == NULL) {
  1206. retry_list = list_create(_list_delete_retry);
  1207. if (retry_list == NULL)
  1208. fatal("list_create failed");
  1209. }
  1210. list_append(retry_list, (void *)queued_req_ptr);
  1211. slurm_mutex_unlock(&retry_mutex);
  1212. /* now process the request in a separate pthread
  1213. * (if we can create another pthread to do so) */
  1214. agent_retry(999, false);
  1215. }
  1216. /* _spawn_retry_agent - pthread_create an agent for the given task */
  1217. static void _spawn_retry_agent(agent_arg_t * agent_arg_ptr)
  1218. {
  1219. int retries = 0;
  1220. pthread_attr_t attr_agent;
  1221. pthread_t thread_agent;
  1222. if (agent_arg_ptr == NULL)
  1223. return;
  1224. debug2("Spawning RPC agent for msg_type %u",
  1225. agent_arg_ptr->msg_type);
  1226. slurm_attr_init(&attr_agent);
  1227. if (pthread_attr_setdetachstate(&attr_agent,
  1228. PTHREAD_CREATE_DETACHED))
  1229. error("pthread_attr_setdetachstate error %m");
  1230. while (pthread_create(&thread_agent, &attr_agent,
  1231. agent, (void *) agent_arg_ptr)) {
  1232. error("pthread_create error %m");
  1233. if (++retries > MAX_RETRIES)
  1234. fatal("Can't create pthread");
  1235. usleep(10000); /* sleep and retry */
  1236. }
  1237. slurm_attr_destroy(&attr_agent);
  1238. }
  1239. /* slurmctld_free_batch_job_launch_msg is a variant of
  1240. * slurm_free_job_launch_msg because all environment variables are currently
  1241. * loaded in one xmalloc buffer (see get_job_env()), which is different
  1242. * from how slurmd assembles the data from a message
  1243. */
  1244. extern void slurmctld_free_batch_job_launch_msg(batch_job_launch_msg_t * msg)
  1245. {
  1246. if (msg) {
  1247. if (msg->environment) {
  1248. xfree(msg->environment[0]);
  1249. xfree(msg->environment);
  1250. }
  1251. slurm_free_job_launch_msg(msg);
  1252. }
  1253. }
  1254. /* agent_purge - purge all pending RPC requests */
  1255. void agent_purge(void)
  1256. {
  1257. if (retry_list) {
  1258. slurm_mutex_lock(&retry_mutex);
  1259. list_destroy(retry_list);
  1260. retry_list = NULL;
  1261. slurm_mutex_unlock(&retry_mutex);
  1262. }
  1263. if (mail_list) {
  1264. slurm_mutex_lock(&mail_mutex);
  1265. list_destroy(mail_list);
  1266. mail_list = NULL;
  1267. slurm_mutex_unlock(&mail_mutex);
  1268. }
  1269. }
  1270. extern int get_agent_count(void)
  1271. {
  1272. return agent_cnt;
  1273. }
  1274. static void _purge_agent_args(agent_arg_t *agent_arg_ptr)
  1275. {
  1276. if (agent_arg_ptr == NULL)
  1277. return;
  1278. hostlist_destroy(agent_arg_ptr->hostlist);
  1279. xfree(agent_arg_ptr->addr);
  1280. if (agent_arg_ptr->msg_args) {
  1281. if (agent_arg_ptr->msg_type == REQUEST_BATCH_JOB_LAUNCH)
  1282. slurmctld_free_batch_job_launch_msg(agent_arg_ptr->
  1283. msg_args);
  1284. else if (agent_arg_ptr->msg_type ==
  1285. RESPONSE_RESOURCE_ALLOCATION)
  1286. slurm_free_resource_allocation_response_msg(
  1287. agent_arg_ptr->msg_args);
  1288. else if ((agent_arg_ptr->msg_type == REQUEST_ABORT_JOB) ||
  1289. (agent_arg_ptr->msg_type == REQUEST_TERMINATE_JOB) ||
  1290. (agent_arg_ptr->msg_type == REQUEST_KILL_PREEMPTED) ||
  1291. (agent_arg_ptr->msg_type == REQUEST_KILL_TIMELIMIT))
  1292. slurm_free_kill_job_msg(agent_arg_ptr->msg_args);
  1293. else if (agent_arg_ptr->msg_type == SRUN_USER_MSG)
  1294. slurm_free_srun_user_msg(agent_arg_ptr->msg_args);
  1295. else if (agent_arg_ptr->msg_type == SRUN_EXEC)
  1296. slurm_free_srun_exec_msg(agent_arg_ptr->msg_args);
  1297. else if (agent_arg_ptr->msg_type == SRUN_NODE_FAIL)
  1298. slurm_free_srun_node_fail_msg(agent_arg_ptr->msg_args);
  1299. else if (agent_arg_ptr->msg_type == SRUN_STEP_MISSING)
  1300. slurm_free_srun_step_missing_msg(
  1301. agent_arg_ptr->msg_args);
  1302. else if (agent_arg_ptr->msg_type == SRUN_STEP_SIGNAL)
  1303. slurm_free_job_step_kill_msg(
  1304. agent_arg_ptr->msg_args);
  1305. else if (agent_arg_ptr->msg_type == REQUEST_JOB_NOTIFY)
  1306. slurm_free_job_notify_msg(agent_arg_ptr->msg_args);
  1307. else if (agent_arg_ptr->msg_type == REQUEST_SUSPEND_INT)
  1308. slurm_free_suspend_int_msg(agent_arg_ptr->msg_args);
  1309. else
  1310. xfree(agent_arg_ptr->msg_args);
  1311. }
  1312. xfree(agent_arg_ptr);
  1313. }
  1314. static mail_info_t *_mail_alloc(void)
  1315. {
  1316. return xmalloc(sizeof(mail_info_t));
  1317. }
  1318. static void _mail_free(void *arg)
  1319. {
  1320. mail_info_t *mi = (mail_info_t *) arg;
  1321. if (mi) {
  1322. xfree(mi->user_name);
  1323. xfree(mi->message);
  1324. xfree(mi);
  1325. }
  1326. }
  1327. /* process an email request and free the record */
  1328. static void _mail_proc(mail_info_t *mi)
  1329. {
  1330. pid_t pid;
  1331. pid = fork();
  1332. if (pid < 0) { /* error */
  1333. error("fork(): %m");
  1334. } else if (pid == 0) { /* child */
  1335. int fd;
  1336. (void) close(0);
  1337. (void) close(1);
  1338. (void) close(2);
  1339. fd = open("/dev/null", O_RDWR); // 0
  1340. if(dup(fd) == -1) // 1
  1341. error("Couldn't do a dup for 1: %m");
  1342. if(dup(fd) == -1) // 2
  1343. error("Couldn't do a dup for 2 %m");
  1344. execle(slurmctld_conf.mail_prog, "mail",
  1345. "-s", mi->message, mi->user_name,
  1346. NULL, NULL);
  1347. error("Failed to exec %s: %m",
  1348. slurmctld_conf.mail_prog);
  1349. exit(1);
  1350. } else { /* parent */
  1351. waitpid(pid, NULL, 0);
  1352. }
  1353. _mail_free(mi);
  1354. return;
  1355. }
  1356. static char *_mail_type_str(uint16_t mail_type)
  1357. {
  1358. if (mail_type == MAIL_JOB_BEGIN)
  1359. return "Began";
  1360. if (mail_type == MAIL_JOB_END)
  1361. return "Ended";
  1362. if (mail_type == MAIL_JOB_FAIL)
  1363. return "Failed";
  1364. if (mail_type == MAIL_JOB_REQUEUE)
  1365. return "Requeued";
  1366. return "unknown";
  1367. }
  1368. static void _set_job_time(struct job_record *job_ptr, uint16_t mail_type,
  1369. char *buf, int buf_len)
  1370. {
  1371. time_t interval = NO_VAL;
  1372. buf[0] = '\0';
  1373. if ((mail_type == MAIL_JOB_BEGIN) && job_ptr->start_time &&
  1374. job_ptr->details && job_ptr->details->submit_time) {
  1375. interval = job_ptr->start_time - job_ptr->details->submit_time;
  1376. snprintf(buf, buf_len, ", Queued time ");
  1377. secs2time_str(interval, buf+14, buf_len-14);
  1378. }
  1379. if (((mail_type == MAIL_JOB_END) || (mail_type == MAIL_JOB_FAIL) ||
  1380. (mail_type == MAIL_JOB_REQUEUE)) &&
  1381. (job_ptr->start_time && job_ptr->end_time)) {
  1382. if (job_ptr->suspend_time) {
  1383. interval = job_ptr->end_time - job_ptr->suspend_time;
  1384. interval += job_ptr->pre_sus_time;
  1385. } else
  1386. interval = job_ptr->end_time - job_ptr->start_time;
  1387. snprintf(buf, buf_len, ", Run time ");
  1388. secs2time_str(interval, buf+11, buf_len-11);
  1389. }
  1390. }
  1391. /*
  1392. * mail_job_info - Send e-mail notice of job state change
  1393. * IN job_ptr - job identification
  1394. * IN state_type - job transition type, see MAIL_JOB in slurm.h
  1395. */
  1396. extern void mail_job_info (struct job_record *job_ptr, uint16_t mail_type)
  1397. {
  1398. char job_time[128];
  1399. mail_info_t *mi = _mail_alloc();
  1400. if (!job_ptr->mail_user)
  1401. mi->user_name = uid_to_string((uid_t)job_ptr->user_id);
  1402. else
  1403. mi->user_name = xstrdup(job_ptr->mail_user);
  1404. mi->message = xmalloc(256);
  1405. _set_job_time(job_ptr, mail_type, job_time, sizeof(job_time));
  1406. sprintf(mi->message, "SLURM Job_id=%u Name=%.24s %s%s",
  1407. job_ptr->job_id, job_ptr->name,
  1408. _mail_type_str(mail_type), job_time);
  1409. debug("email msg to %s: %s", mi->user_name, mi->message);
  1410. slurm_mutex_lock(&mail_mutex);
  1411. if (!mail_list) {
  1412. mail_list = list_create(_mail_free);
  1413. if (!mail_list)
  1414. fatal("list_create failed");
  1415. }
  1416. if (!list_enqueue(mail_list, (void *) mi))
  1417. fatal("list_enqueue failed");
  1418. slurm_mutex_unlock(&mail_mutex);
  1419. return;
  1420. }
  1421. /* Test if a batch launch request should be deferred
  1422. * RET -1: abort the request, pending job cancelled
  1423. * 0: execute the request now
  1424. * 1: defer the request
  1425. */
  1426. static int _batch_launch_defer(queued_request_t *queued_req_ptr)
  1427. {
  1428. agent_arg_t *agent_arg_ptr;
  1429. batch_job_launch_msg_t *launch_msg_ptr;
  1430. time_t now = time(NULL);
  1431. struct job_record *job_ptr;
  1432. int delay_time, nodes_ready = 0, tmp;
  1433. agent_arg_ptr = queued_req_ptr->agent_arg_ptr;
  1434. if (agent_arg_ptr->msg_type != REQUEST_BATCH_JOB_LAUNCH)
  1435. return 0;
  1436. if (difftime(now, queued_req_ptr->last_attempt) < 10) {
  1437. /* Reduce overhead by only testing once every 10 secs */
  1438. return 1;
  1439. }
  1440. launch_msg_ptr = (batch_job_launch_msg_t *)agent_arg_ptr->msg_args;
  1441. job_ptr = find_job_record(launch_msg_ptr->job_id);
  1442. if ((job_ptr == NULL) ||
  1443. (!IS_JOB_RUNNING(job_ptr) && !IS_JOB_SUSPENDED(job_ptr))) {
  1444. info("agent(batch_launch): removed pending request for "
  1445. "cancelled job %u",
  1446. launch_msg_ptr->job_id);
  1447. return -1; /* job cancelled while waiting */
  1448. }
  1449. if (job_ptr->wait_all_nodes) {
  1450. (void) job_node_ready(launch_msg_ptr->job_id, &tmp);
  1451. if (tmp == (READY_JOB_STATE | READY_NODE_STATE)) {
  1452. nodes_ready = 1;
  1453. if (launch_msg_ptr->alias_list &&
  1454. !strcmp(launch_msg_ptr->alias_list, "TBD")) {
  1455. /* Update launch RPC with correct node
  1456. * aliases */
  1457. struct job_record *job_ptr;
  1458. job_ptr = find_job_record(launch_msg_ptr->
  1459. job_id);
  1460. xfree(launch_msg_ptr->alias_list);
  1461. launch_msg_ptr->alias_list = xstrdup(job_ptr->
  1462. alias_list);
  1463. }
  1464. }
  1465. } else {
  1466. #ifdef HAVE_FRONT_END
  1467. nodes_ready = 1;
  1468. #else
  1469. struct node_record *node_ptr;
  1470. char *hostname;
  1471. hostname = hostlist_deranged_string_xmalloc(
  1472. agent_arg_ptr->hostlist);
  1473. node_ptr = find_node_record(hostname);
  1474. if (node_ptr == NULL) {
  1475. error("agent(batch_launch) removed pending request for "
  1476. "job %u, missing node %s",
  1477. launch_ms

(Listing truncated here; the remainder of the file is not shown.)