
/src/plugins/sched/wiki2/msg.c

https://github.com/cfenoy/slurm
/*****************************************************************************\
 *  msg.c - Message/communication manager for Wiki plugin
 *****************************************************************************
 *  Copyright (C) 2006-2007 The Regents of the University of California.
 *  Copyright (C) 2008-2009 Lawrence Livermore National Security.
 *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 *  Written by Morris Jette <jette1@llnl.gov>
 *  CODE-OCEC-09-009. All rights reserved.
 *
 *  This file is part of SLURM, a resource management program.
 *  For details, see <http://www.schedmd.com/slurmdocs/>.
 *  Please also read the included file: DISCLAIMER.
 *
 *  SLURM is free software; you can redistribute it and/or modify it under
 *  the terms of the GNU General Public License as published by the Free
 *  Software Foundation; either version 2 of the License, or (at your option)
 *  any later version.
 *
 *  In addition, as a special exception, the copyright holders give permission
 *  to link the code of portions of this program with the OpenSSL library under
 *  certain conditions as described in each individual source file, and
 *  distribute linked combinations including the two. You must obey the GNU
 *  General Public License in all respects for all of the code used other than
 *  OpenSSL. If you modify file(s) with this exception, you may extend this
 *  exception to your version of the file(s), but you are not obligated to do
 *  so. If you do not wish to do so, delete this exception statement from your
 *  version. If you delete this exception statement from all source files in
 *  the program, then also delete it here.
 *
 *  SLURM is distributed in the hope that it will be useful, but WITHOUT ANY
 *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 *  FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 *  details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with SLURM; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#include "slurm/slurm.h"
#include "src/common/uid.h"
#include "src/slurmctld/locks.h"
#include "src/plugins/sched/wiki2/crypto.h"
#include "src/plugins/sched/wiki2/msg.h"
#include <sys/poll.h>

#define _DEBUG 0

/* When a remote socket closes on AIX, we have seen poll() return EAGAIN
 * indefinitely for a pending write request. Rather than locking up
 * slurmctld's wiki interface, abort after MAX_RETRIES poll() failures. */
#define MAX_RETRIES 10
static bool thread_running = false;
static bool thread_shutdown = false;
static pthread_mutex_t thread_flag_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_t msg_thread_id;
static char *err_msg;
static int err_code;
static uint16_t sched_port;

/* Global configuration parameters */
char auth_key[KEY_SIZE] = "";
char e_host[E_HOST_SIZE] = "";
char e_host_bu[E_HOST_SIZE] = "";
uint16_t e_port = 0;
struct part_record *exclude_part_ptr[EXC_PART_CNT];
struct part_record *hide_part_ptr[HIDE_PART_CNT];
struct part_record *hide_part_nodes_ptr[HIDE_PART_CNT];
uint32_t first_job_id;
uint16_t job_aggregation_time = 10;	/* Default value is 10 seconds */
int init_prio_mode = PRIO_HOLD;
uint16_t kill_wait;
uint16_t use_host_exp = 0;

static char * _get_wiki_conf_path(void);
static void * _msg_thread(void *no_data);
static int    _parse_msg(char *msg, char **req);
static void   _proc_msg(slurm_fd_t new_fd, char *msg);
static size_t _read_bytes(int fd, char *buf, size_t size);
static char * _recv_msg(slurm_fd_t new_fd);
static size_t _send_msg(slurm_fd_t new_fd, char *buf, size_t size);
static void   _send_reply(slurm_fd_t new_fd, char *response);
static size_t _write_bytes(int fd, char *buf, size_t size);
/*****************************************************************************\
 * spawn message handler thread
\*****************************************************************************/
extern int spawn_msg_thread(void)
{
        pthread_attr_t thread_attr_msg;

        pthread_mutex_lock(&thread_flag_mutex);
        if (thread_running) {
                error("Wiki thread already running, not starting another");
                pthread_mutex_unlock(&thread_flag_mutex);
                return SLURM_ERROR;
        }

        parse_wiki_config();
        slurm_attr_init(&thread_attr_msg);
        if (pthread_create(&msg_thread_id, &thread_attr_msg,
                           _msg_thread, NULL))
                fatal("pthread_create %m");

        (void) event_notify(1235, "Slurm startup");
        slurm_attr_destroy(&thread_attr_msg);
        thread_running = true;
        pthread_mutex_unlock(&thread_flag_mutex);
        return SLURM_SUCCESS;
}
/*****************************************************************************\
 * terminate message handler thread
\*****************************************************************************/
extern void term_msg_thread(void)
{
        pthread_mutex_lock(&thread_flag_mutex);
        if (thread_running) {
                int fd;
                slurm_addr_t addr;

                thread_shutdown = true;

                /* Open and close a connection to the wiki listening port.
                 * Allows slurm_accept_msg_conn() to return in
                 * _msg_thread() so that it can check the thread_shutdown
                 * flag.
                 */
                slurm_set_addr(&addr, sched_port, "localhost");
                fd = slurm_open_stream(&addr);
                if (fd != -1) {
                        /* we don't care if the open failed */
                        slurm_close_stream(fd);
                }

                debug2("waiting for sched/wiki2 thread to exit");
                pthread_join(msg_thread_id, NULL);
                msg_thread_id = 0;
                thread_shutdown = false;
                thread_running = false;
                debug2("join of sched/wiki2 thread was successful");
        }
        pthread_mutex_unlock(&thread_flag_mutex);
}
/*****************************************************************************\
 * message handler thread
\*****************************************************************************/
static void *_msg_thread(void *no_data)
{
        slurm_fd_t sock_fd = -1, new_fd;
        slurm_addr_t cli_addr;
        char *msg;
        slurm_ctl_conf_t *conf;
        int i;
        /* Locks: Write configuration, job, node, and partition */
        slurmctld_lock_t config_write_lock = {
                WRITE_LOCK, WRITE_LOCK, WRITE_LOCK, WRITE_LOCK };

        conf = slurm_conf_lock();
        sched_port = conf->schedport;
        slurm_conf_unlock();

        /* Wait until configuration is completely loaded */
        lock_slurmctld(config_write_lock);
        unlock_slurmctld(config_write_lock);

        /* If SchedulerPort is already taken, keep trying to open it
         * once per minute. Slurmctld will continue to function
         * during this interval even if nothing can be scheduled. */
        for (i = 0; (!thread_shutdown); i++) {
                if (i > 0)
                        sleep(60);
                sock_fd = slurm_init_msg_engine_port(sched_port);
                if (sock_fd != SLURM_SOCKET_ERROR)
                        break;
                error("wiki: slurm_init_msg_engine_port %u %m",
                      sched_port);
                error("wiki: Unable to communicate with Moab");
        }

        /* Process incoming RPCs until told to shutdown */
        while (!thread_shutdown) {
                if ((new_fd = slurm_accept_msg_conn(sock_fd, &cli_addr))
                    == SLURM_SOCKET_ERROR) {
                        if (errno != EINTR)
                                error("wiki: slurm_accept_msg_conn %m");
                        continue;
                }
                if (thread_shutdown) {
                        close(new_fd);
                        break;
                }
                /* It would be nice to create a pthread for each new
                 * RPC, but that leaks memory on some systems when
                 * done from a plugin.
                 * FIXME: Maintain a pool of pthreads and reuse them. */
                err_code = 0;
                err_msg = "";
                msg = _recv_msg(new_fd);
                if (msg) {
                        _proc_msg(new_fd, msg);
                        xfree(msg);
                }
                slurm_close_accepted_conn(new_fd);
        }

        verbose("wiki: message engine shutdown");
        if (sock_fd > 0)
                (void) slurm_shutdown_msg_engine(sock_fd);
        pthread_exit((void *) 0);
        return NULL;
}
/*****************************************************************************\
 * _get_wiki_conf_path - return the pathname of the wiki.conf file
 * return value must be xfreed
\*****************************************************************************/
static char * _get_wiki_conf_path(void)
{
        char *val = getenv("SLURM_CONF");
        char *path = NULL;
        int i;

        if (!val)
                val = default_slurm_config_file;

        /* Replace file name on end of path */
        i = strlen(val) + 10;
        path = xmalloc(i);
        strcpy(path, val);
        val = strrchr(path, (int)'/');
        if (val)        /* absolute path */
                val++;
        else            /* not absolute path */
                val = path;
        strcpy(val, "wiki.conf");
        return path;
}
/*****************************************************************************\
 * parse_wiki_config - Results go into global variables
 * RET SLURM_SUCCESS or error code
 *
 * See "man wiki.conf" for details.
\*****************************************************************************/
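/* Illustrative wiki.conf sketch. The keys come from the options[] table
 * below; the values are made up for this example and are NOT defaults:
 *
 *      AuthKey=123456789
 *      EHost=control-host
 *      EHostBackup=backup-host
 *      EPort=15017
 *      ExcludePartitions=debug
 *      HidePartitionJobs=test
 *      HidePartitionNodes=test
 *      HostFormat=0
 *      JobAggregationTime=10
 *      JobPriority=hold
 */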
extern int parse_wiki_config(void)
{
        s_p_options_t options[] = {
                {"AuthKey", S_P_STRING},
                {"EHost", S_P_STRING},
                {"EHostBackup", S_P_STRING},
                {"EPort", S_P_UINT16},
                {"ExcludePartitions", S_P_STRING},
                {"HidePartitionJobs", S_P_STRING},
                {"HidePartitionNodes", S_P_STRING},
                {"HostFormat", S_P_UINT16},
                {"JobAggregationTime", S_P_UINT16},
                {"JobPriority", S_P_STRING},
                {NULL} };
        s_p_hashtbl_t *tbl;
        char *exclude_partitions, *hide_partitions, *hide_part_nodes;
        char *key = NULL, *priority_mode = NULL, *wiki_conf;
        struct stat buf;
        slurm_ctl_conf_t *conf;
        int i;

        /* Set default values */
        for (i = 0; i < EXC_PART_CNT; i++)
                exclude_part_ptr[i] = NULL;
        for (i = 0; i < HIDE_PART_CNT; i++)
                hide_part_ptr[i] = NULL;
        for (i = 0; i < HIDE_PART_CNT; i++)
                hide_part_nodes_ptr[i] = NULL;
        conf = slurm_conf_lock();
        strncpy(e_host, conf->control_addr, sizeof(e_host));
        if (conf->backup_addr) {
                strncpy(e_host_bu, conf->backup_addr,
                        sizeof(e_host_bu));
        }
        kill_wait = conf->kill_wait;
        first_job_id = conf->first_job_id;
        slurm_conf_unlock();

        wiki_conf = _get_wiki_conf_path();
        if ((wiki_conf == NULL) || (stat(wiki_conf, &buf) == -1)) {
                fatal("No wiki.conf file (%s)", wiki_conf);
                xfree(wiki_conf);
                return SLURM_SUCCESS;
        }

        debug("Reading wiki.conf file (%s)", wiki_conf);
        tbl = s_p_hashtbl_create(options);
        if (s_p_parse_file(tbl, NULL, wiki_conf, false) == SLURM_ERROR)
                fatal("something wrong with opening/reading wiki.conf file");

        if (!s_p_get_string(&key, "AuthKey", tbl))
                fatal("No wiki_conf AuthKey specified");
        else {
                strncpy(auth_key, key, sizeof(auth_key));
                xfree(key);
        }
        if (s_p_get_string(&key, "EHost", tbl)) {
                strncpy(e_host, key, sizeof(e_host));
                xfree(key);
        } else
                debug("wiki: Using ControlAddr for EHost value");
        if (s_p_get_string(&key, "EHostBackup", tbl)) {
                strncpy(e_host_bu, key, sizeof(e_host_bu));
                xfree(key);
        }
        s_p_get_uint16(&e_port, "EPort", tbl);
        s_p_get_uint16(&job_aggregation_time, "JobAggregationTime", tbl);

        if (s_p_get_string(&exclude_partitions, "ExcludePartitions", tbl)) {
                char *tok = NULL, *tok_p = NULL;
                tok = strtok_r(exclude_partitions, ",", &tok_p);
                i = 0;
                while (tok) {
                        if (i >= EXC_PART_CNT) {
                                error("ExcludePartitions has too many entries, "
                                      "skipping %s and later entries", tok);
                                break;
                        }
                        exclude_part_ptr[i] = find_part_record(tok);
                        if (exclude_part_ptr[i])
                                i++;
                        else
                                error("ExcludePartitions %s not found", tok);
                        tok = strtok_r(NULL, ",", &tok_p);
                }
        }

        if (s_p_get_string(&hide_partitions, "HidePartitionJobs", tbl)) {
                char *tok = NULL, *tok_p = NULL;
                tok = strtok_r(hide_partitions, ",", &tok_p);
                i = 0;
                while (tok) {
                        if (i >= HIDE_PART_CNT) {
                                error("HidePartitionJobs has too many entries, "
                                      "skipping %s and later entries", tok);
                                break;
                        }
                        hide_part_ptr[i] = find_part_record(tok);
                        if (hide_part_ptr[i])
                                i++;
                        else
                                error("HidePartitionJobs %s not found", tok);
                        tok = strtok_r(NULL, ",", &tok_p);
                }
        }

        if (s_p_get_string(&hide_part_nodes, "HidePartitionNodes", tbl)) {
                char *tok = NULL, *tok_p = NULL;
                tok = strtok_r(hide_part_nodes, ",", &tok_p);
                i = 0;
                while (tok) {
                        if (i >= HIDE_PART_CNT) {
                                error("HidePartitionNodes has too many entries, "
                                      "skipping %s and later entries", tok);
                                break;
                        }
                        hide_part_nodes_ptr[i] = find_part_record(tok);
                        if (hide_part_nodes_ptr[i])
                                i++;
                        else
                                error("HidePartitionNodes %s not found", tok);
                        tok = strtok_r(NULL, ",", &tok_p);
                }
        }

        if (s_p_get_string(&priority_mode, "JobPriority", tbl)) {
                if (strcasecmp(priority_mode, "hold") == 0)
                        init_prio_mode = PRIO_HOLD;
                else if (strcasecmp(priority_mode, "run") == 0)
                        init_prio_mode = PRIO_DECREMENT;
                else
                        error("Invalid value for JobPriority in wiki.conf");
                xfree(priority_mode);
        }
        s_p_get_uint16(&use_host_exp, "HostFormat", tbl);
        s_p_hashtbl_destroy(tbl);
        xfree(wiki_conf);

#if _DEBUG
        info("AuthKey = %s", auth_key);
        info("EHost = %s", e_host);
        info("EHostBackup = %s", e_host_bu);
        info("EPort = %u", e_port);
        info("HostFormat = %u", use_host_exp);
        info("JobAggregationTime = %u sec", job_aggregation_time);
        info("JobPriority = %s", init_prio_mode ? "run" : "hold");
        info("KillWait = %u sec", kill_wait);
        for (i = 0; i < EXC_PART_CNT; i++) {
                if (!exclude_part_ptr[i])
                        continue;
                info("ExcludePartitions = %s", exclude_part_ptr[i]->name);
        }
        for (i = 0; i < HIDE_PART_CNT; i++) {
                if (!hide_part_ptr[i])
                        continue;
                info("HidePartitionJobs = %s", hide_part_ptr[i]->name);
        }
        for (i = 0; i < HIDE_PART_CNT; i++) {
                if (!hide_part_nodes_ptr[i])
                        continue;
                info("HidePartitionNodes = %s", hide_part_nodes_ptr[i]->name);
        }
#endif
        return SLURM_SUCCESS;
}
/*
 * Return a string containing any scheduling plugin configuration information
 * that we want to expose via "scontrol show configuration".
 * NOTE: the caller must xfree the returned pointer
 */
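/* Example of the string built below (partition names are illustrative):
 *      "HostFormat=0;JobAggregationTime=10;ExcludePartitions=debug,test"
 * The HidePartitionJobs and HidePartitionNodes fields are appended only
 * when the corresponding partitions are configured. */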
extern char * get_wiki_conf(void)
{
        int i, first;
        char buf[32], *conf = NULL;

        snprintf(buf, sizeof(buf), "HostFormat=%u", use_host_exp);
        xstrcat(conf, buf);

        snprintf(buf, sizeof(buf), ";JobAggregationTime=%u",
                 job_aggregation_time);
        xstrcat(conf, buf);

        first = 1;
        for (i = 0; i < EXC_PART_CNT; i++) {
                if (!exclude_part_ptr[i])
                        continue;
                if (first) {
                        xstrcat(conf, ";ExcludePartitions=");
                        first = 0;
                } else
                        xstrcat(conf, ",");
                xstrcat(conf, exclude_part_ptr[i]->name);
        }

        first = 1;
        for (i = 0; i < HIDE_PART_CNT; i++) {
                if (!hide_part_ptr[i])
                        continue;
                if (first) {
                        xstrcat(conf, ";HidePartitionJobs=");
                        first = 0;
                } else
                        xstrcat(conf, ",");
                xstrcat(conf, hide_part_ptr[i]->name);
        }

        first = 1;
        for (i = 0; i < HIDE_PART_CNT; i++) {
                if (!hide_part_nodes_ptr[i])
                        continue;
                if (first) {
                        xstrcat(conf, ";HidePartitionNodes=");
                        first = 0;
                } else
                        xstrcat(conf, ",");
                xstrcat(conf, hide_part_nodes_ptr[i]->name);
        }
        return conf;
}
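/* Read up to size bytes from fd into buf, polling with a 10 second timeout
 * before each read. Returns the number of bytes actually read, which may be
 * less than size on timeout, poll/read error, or EOF. */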
static size_t _read_bytes(int fd, char *buf, size_t size)
{
        size_t bytes_remaining;
        ssize_t bytes_read;
        char *ptr;
        struct pollfd ufds;
        int rc;

        bytes_remaining = size;
        size = 0;
        ufds.fd = fd;
        ufds.events = POLLIN;
        ptr = buf;
        while (bytes_remaining > 0) {
                rc = poll(&ufds, 1, 10000);     /* 10 sec timeout */
                if (rc == 0)            /* timed out */
                        break;
                if ((rc == -1) &&       /* some error */
                    ((errno == EINTR) || (errno == EAGAIN)))
                        continue;
                if ((ufds.revents & POLLIN) == 0)       /* some poll error */
                        break;
                bytes_read = read(fd, ptr, bytes_remaining);
                if (bytes_read <= 0)
                        break;
                bytes_remaining -= bytes_read;
                size += bytes_read;
                ptr += bytes_read;
        }
        return size;
}
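/* Write size bytes from buf to fd, polling with a 10 second timeout before
 * each write and giving up after MAX_RETRIES consecutive poll failures.
 * Returns the number of bytes actually written. */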
static size_t _write_bytes(int fd, char *buf, size_t size)
{
        size_t bytes_remaining;
        ssize_t bytes_written;
        char *ptr;
        struct pollfd ufds;
        int rc, retry_cnt = 0;

        bytes_remaining = size;
        size = 0;
        ptr = buf;
        ufds.fd = fd;
        ufds.events = POLLOUT;
        while (bytes_remaining > 0) {
                rc = poll(&ufds, 1, 10000);     /* 10 sec timeout */
                if (rc == 0)            /* timed out */
                        break;
                if ((rc == -1) &&       /* some error */
                    ((errno == EINTR) || (errno == EAGAIN))) {
                        if ((retry_cnt++) >= MAX_RETRIES) {
                                error("wiki: repeated poll errors for "
                                      "write: %m");
                                break;
                        }
                        continue;
                }
                if ((ufds.revents & POLLOUT) == 0)      /* some poll error */
                        break;
                bytes_written = write(fd, ptr, bytes_remaining);
                if (bytes_written <= 0)
                        break;
                bytes_remaining -= bytes_written;
                size += bytes_written;
                ptr += bytes_written;
        }
        return size;
}
/*****************************************************************************\
 * Read a message (request) from specified file descriptor
 *
 * RET - The message which must be xfreed or
 *       NULL on error
\*****************************************************************************/
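/* Wire format note: each message is framed by a 9 character ASCII header
 * holding the payload length in decimal (see _send_msg() below), followed
 * by the payload itself. */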
static char * _recv_msg(slurm_fd_t new_fd)
{
        char header[10];
        unsigned long size;
        char *buf;

        if (_read_bytes((int) new_fd, header, 9) != 9) {
                err_code = -240;
                err_msg = "failed to read message header";
                error("wiki: failed to read message header %m");
                return NULL;
        }

        if (sscanf(header, "%lu", &size) != 1) {
                err_code = -244;
                err_msg = "malformed message header";
                error("wiki: malformed message header (%s)", header);
                return NULL;
        }

        buf = xmalloc(size + 1);        /* need '\0' on end to print */
        if (_read_bytes((int) new_fd, buf, size) != size) {
                err_code = -246;
                err_msg = "unable to read all message data";
                error("wiki: unable to read data message");
                xfree(buf);
                return NULL;
        }

        if (slurm_get_debug_flags() & DEBUG_FLAG_WIKI)
                info("wiki msg recv:%s", buf);

        return buf;
}
/*****************************************************************************\
 * Send a message (response) to specified file descriptor
 *
 * RET - Number of data bytes written (excludes header)
\*****************************************************************************/
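/* The header written below is the payload length formatted as "%08lu\n",
 * e.g. a 42 byte payload is preceded by the 9 characters "00000042\n". */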
static size_t _send_msg(slurm_fd_t new_fd, char *buf, size_t size)
{
        char header[10];
        size_t data_sent;

        if (slurm_get_debug_flags() & DEBUG_FLAG_WIKI)
                info("wiki msg send:%s", buf);

        (void) sprintf(header, "%08lu\n", (unsigned long) size);
        if (_write_bytes((int) new_fd, header, 9) != 9) {
                error("wiki: failed to write message header %m");
                return 0;
        }

        data_sent = _write_bytes((int) new_fd, buf, size);
        if (data_sent != size) {
                error("wiki: unable to write data message (%lu of %lu) %m",
                      (long unsigned) data_sent, (long unsigned) size);
        }

        return data_sent;
}
/*****************************************************************************\
 * Parse and checksum a wiki request
 * msg IN - message to parse
 * req OUT - pointer to the request portion of the message
 * RET 0 on success, -1 on error
\*****************************************************************************/
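/* A request is expected to look roughly like the following single line
 * (values and field order after CK= are illustrative; the CK= checksum must
 * come first, since the comparison below runs against the message start):
 *
 *      CK=0123456789abcdef TS=1234567890 AUTH=root DT=CMD=GETJOBS
 */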
static int _parse_msg(char *msg, char **req)
{
        char sum[20];   /* format is "CK=%08x%08x" */
        char *auth_ptr = strstr(msg, "AUTH=");
        char *dt_ptr = strstr(msg, "DT=");
        char *ts_ptr = strstr(msg, "TS=");
        char *cmd_ptr = strstr(msg, "CMD=");
        time_t ts, now = time(NULL);
        uint32_t delta_t;

        if ((auth_key[0] == '\0') && cmd_ptr) {
                /* No authentication required */
                *req = cmd_ptr;
                return 0;
        }

        if (!auth_ptr) {
                err_code = -300;
                err_msg = "request lacks AUTH";
                error("wiki: request lacks AUTH=");
                return -1;
        }

        if (!dt_ptr) {
                err_code = -300;
                err_msg = "request lacks DT";
                error("wiki: request lacks DT=");
                return -1;
        }

        if (!ts_ptr) {
                err_code = -300;
                err_msg = "request lacks TS";
                error("wiki: request lacks TS=");
                return -1;
        }

        ts = strtoul((ts_ptr+3), NULL, 10);
        if (ts < now)
                delta_t = (uint32_t) difftime(now, ts);
        else
                delta_t = (uint32_t) difftime(ts, now);
        if (delta_t > 300) {
                err_code = -350;
                err_msg = "TS value too far from NOW";
                error("wiki: TimeStamp too far from NOW (%u secs)",
                      delta_t);
                return -1;
        }

        if (auth_key[0] != '\0') {
                checksum(sum, auth_key, ts_ptr);
                if (strncmp(sum, msg, 19) != 0) {
                        err_code = -422;
                        err_msg = "bad checksum";
                        error("wiki: message checksum error, "
                              "check AuthKey in wiki.conf");
                        return -1;
                }
        }

        *req = dt_ptr + 3;
        return 0;
}
/*****************************************************************************\
 * Parse, process and respond to a request
\*****************************************************************************/
static void _proc_msg(slurm_fd_t new_fd, char *msg)
{
        DEF_TIMERS;
        char *req, *cmd_ptr, *msg_type = NULL;
        char response[128];

        if (new_fd < 0)
                return;

        START_TIMER;
        if (!msg) {
                err_code = -300;
                err_msg = "NULL request message";
                error("wiki: NULL request message");
                goto resp_msg;
        }

        if (_parse_msg(msg, &req) != 0)
                goto resp_msg;

        cmd_ptr = strstr(req, "CMD=");
        if (cmd_ptr == NULL) {
                err_code = -300;
                err_msg = "request lacks CMD";
                error("wiki: request lacks CMD");
                goto resp_msg;
        }
        cmd_ptr += 4;
        err_code = 0;
        if (strncmp(cmd_ptr, "GETJOBS", 7) == 0) {
                msg_type = "wiki:GETJOBS";
                if (!get_jobs(cmd_ptr, &err_code, &err_msg))
                        goto free_resp_msg;
        } else if (strncmp(cmd_ptr, "GETNODES", 8) == 0) {
                msg_type = "wiki:GETNODES";
                if (!get_nodes(cmd_ptr, &err_code, &err_msg))
                        goto free_resp_msg;
        } else if (strncmp(cmd_ptr, "STARTJOB", 8) == 0) {
                msg_type = "wiki:STARTJOB";
                start_job(cmd_ptr, &err_code, &err_msg);
        } else if (strncmp(cmd_ptr, "CANCELJOB", 9) == 0) {
                msg_type = "wiki:CANCELJOB";
                cancel_job(cmd_ptr, &err_code, &err_msg);
        } else if (strncmp(cmd_ptr, "REQUEUEJOB", 10) == 0) {
                msg_type = "wiki:REQUEUEJOB";
                job_requeue_wiki(cmd_ptr, &err_code, &err_msg);
        } else if (strncmp(cmd_ptr, "SUSPENDJOB", 10) == 0) {
                msg_type = "wiki:SUSPENDJOB";
                suspend_job(cmd_ptr, &err_code, &err_msg);
        } else if (strncmp(cmd_ptr, "RESUMEJOB", 9) == 0) {
                msg_type = "wiki:RESUMEJOB";
                resume_job(cmd_ptr, &err_code, &err_msg);
        } else if (strncmp(cmd_ptr, "JOBADDTASK", 10) == 0) {
                msg_type = "wiki:JOBADDTASK";
                job_add_task(cmd_ptr, &err_code, &err_msg);
        } else if (strncmp(cmd_ptr, "JOBRELEASETASK", 14) == 0) {
                msg_type = "wiki:JOBRELEASETASK";
                job_release_task(cmd_ptr, &err_code, &err_msg);
        } else if (strncmp(cmd_ptr, "JOBWILLRUN", 10) == 0) {
                msg_type = "wiki:JOBWILLRUN";
                if (strstr(cmd_ptr, "NODES=")) {
                        /* Updated format input and output */
                        if (!job_will_run2(cmd_ptr, &err_code, &err_msg))
                                goto free_resp_msg;
                } else {
                        if (!job_will_run(cmd_ptr, &err_code, &err_msg))
                                goto free_resp_msg;
                }
        } else if (strncmp(cmd_ptr, "MODIFYJOB", 9) == 0) {
                msg_type = "wiki:MODIFYJOB";
                job_modify_wiki(cmd_ptr, &err_code, &err_msg);
        } else if (strncmp(cmd_ptr, "NOTIFYJOB", 9) == 0) {
                msg_type = "wiki:NOTIFYJOB";
                job_notify_wiki(cmd_ptr, &err_code, &err_msg);
        } else if (strncmp(cmd_ptr, "SIGNALJOB", 9) == 0) {
                msg_type = "wiki:SIGNALJOB";
                job_signal_wiki(cmd_ptr, &err_code, &err_msg);
        } else if (strncmp(cmd_ptr, "INITIALIZE", 10) == 0) {
                msg_type = "wiki:INITIALIZE";
                initialize_wiki(cmd_ptr, &err_code, &err_msg);
        } else {
                err_code = -300;
                err_msg = "unsupported request type";
                error("wiki: unrecognized request type: %s", req);
        }
        END_TIMER2(msg_type);

resp_msg:
        snprintf(response, sizeof(response),
                 "SC=%d RESPONSE=%s", err_code, err_msg);
        _send_reply(new_fd, response);
        return;

free_resp_msg:
        /* Message is pre-formatted by get_jobs and get_nodes
         * ONLY if no error. Send message and xfree the buffer. */
        _send_reply(new_fd, err_msg);
        xfree(err_msg);
        return;
}
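/* Build and send a reply of the form
 *      CK=<checksum> TS=<time> AUTH=<slurmctld user> DT=<response>
 * where the leading checksum is computed over the portion of the buffer
 * that follows it (see the checksum() call below). */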
static void _send_reply(slurm_fd_t new_fd, char *response)
{
        size_t i;
        char *buf, sum[20], *tmp;
        static char uname[64] = "";

        i = strlen(response);
        i += 100;       /* leave room for header */
        buf = xmalloc(i);

        if (uname[0] == '\0') {
                tmp = uid_to_string(getuid());
                strncpy(uname, tmp, sizeof(uname));
                uname[sizeof(uname) - 1] = '\0';
                xfree(tmp);
        }

        snprintf(buf, i, "CK=dummy67890123456 TS=%u AUTH=%s DT=%s",
                 (uint32_t) time(NULL), uname, response);
        checksum(sum, auth_key, (buf+20));      /* overwrite "CK=dummy..." above */
        memcpy(buf, sum, 19);

        i = strlen(buf) + 1;
        (void) _send_msg(new_fd, buf, i);
        xfree(buf);
}