
/globus_gram_job_manager-13.33/globus_gram_job_manager_seg.c

C | 1775 lines | 1458 code | 173 blank | 144 comment | 231 complexity
Possible License(s): Apache-2.0

Large files are truncated in this view; this listing ends partway through the file.

  1. /*
  2. * Copyright 1999-2010 University of Chicago
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "globus_common.h"
  17. #include "globus_gram_job_manager.h"
  18. #include "globus_scheduler_event_generator.h"
  19. #include "globus_scheduler_event_generator_app.h"
  20. #include <sys/types.h>
  21. #include <utime.h>
  22. #include <regex.h>
  23. typedef struct globus_gram_seg_resume_s
  24. {
  25. globus_gram_job_manager_t * manager;
  26. globus_list_t * events;
  27. }
  28. globus_gram_seg_resume_t;
  29. globus_result_t
  30. globus_l_gram_seg_event_callback(
  31. void * user_arg,
  32. const globus_scheduler_event_t * event);
  33. static
  34. void
  35. globus_l_gram_fork_poll_callback(
  36. void * user_arg);
  37. static
  38. int
  39. globus_l_gram_deliver_event(
  40. globus_gram_jobmanager_request_t * request,
  41. globus_scheduler_event_t * event);
  42. static
  43. void
  44. globus_l_seg_resume_callback(
  45. void * user_arg);
  46. static
  47. int
  48. globus_l_condor_parse_log(
  49. const char * data,
  50. globus_gram_job_manager_ref_t * ref,
  51. globus_fifo_t * events);
  52. static
  53. int
  54. globus_l_condor_read_log(
  55. globus_gram_job_manager_t *manager,
  56. const char *path,
  57. size_t last_size,
  58. char **data);
  59. static
  60. void
  61. globus_l_gram_condor_poll_callback(
  62. void * user_arg);
  63. globus_result_t
  64. globus_gram_job_manager_init_seg(
  65. globus_gram_job_manager_t * manager)
  66. {
  67. globus_result_t result = GLOBUS_SUCCESS;
  68. int rc;
  69. globus_gram_job_manager_log(
  70. manager,
  71. GLOBUS_GRAM_JOB_MANAGER_LOG_TRACE,
  72. "event=gram.seg.start level=TRACE module=%s\n",
  73. manager->config->seg_module ? manager->config->seg_module : "fork");
  74. GlobusGramJobManagerLock(manager);
  75. if (manager->config->seg_module == NULL &&
  76. strcmp(manager->config->jobmanager_type, "fork") == 0)
  77. {
  78. globus_reltime_t delay;
  79. GlobusTimeReltimeSet(delay, 1, 0);
  80. result = globus_callback_register_periodic(
  81. &manager->fork_callback_handle,
  82. &delay,
  83. &delay,
  84. globus_l_gram_fork_poll_callback,
  85. manager);
  86. if (result != GLOBUS_SUCCESS)
  87. {
  88. char * errstr;
  89. char * errstr_escaped;
  90. errstr = globus_error_print_friendly(globus_error_peek(result));
  91. errstr_escaped = globus_gram_prepare_log_string(
  92. errstr);
  93. globus_gram_job_manager_log(
  94. manager,
  95. GLOBUS_GRAM_JOB_MANAGER_LOG_WARN,
  96. "event=gram.seg.end level=WARN status=%d "
  97. "reason=\"%s\"\n",
  98. -1,
  99. errstr_escaped ? errstr_escaped : "");
  100. if (errstr_escaped)
  101. {
  102. free(errstr_escaped);
  103. }
  104. if (errstr)
  105. {
  106. free(errstr);
  107. }
  108. goto failed_periodic;
  109. }
  110. }
  111. else if (strcmp(manager->config->jobmanager_type, "condor") == 0)
  112. {
  113. globus_reltime_t delay;
  114. GlobusTimeReltimeSet(delay, 5, 0);
  115. result = globus_callback_register_periodic(
  116. &manager->fork_callback_handle,
  117. &delay,
  118. &delay,
  119. globus_l_gram_condor_poll_callback,
  120. manager);
  121. if (result != GLOBUS_SUCCESS)
  122. {
  123. char * errstr;
  124. char * errstr_escaped;
  125. errstr = globus_error_print_friendly(globus_error_peek(result));
  126. errstr_escaped = globus_gram_prepare_log_string(
  127. errstr);
  128. globus_gram_job_manager_log(
  129. manager,
  130. GLOBUS_GRAM_JOB_MANAGER_LOG_WARN,
  131. "event=gram.seg.end level=WARN status=%d "
  132. "reason=\"%s\"\n",
  133. -1,
  134. errstr_escaped ? errstr_escaped : "");
  135. if (errstr_escaped)
  136. {
  137. free(errstr_escaped);
  138. }
  139. if (errstr)
  140. {
  141. free(errstr);
  142. }
  143. goto failed_periodic;
  144. }
  145. }
  146. else
  147. {
  148. globus_gram_job_manager_log(
  149. manager,
  150. GLOBUS_GRAM_JOB_MANAGER_LOG_TRACE,
  151. "event=gram.seg.activate.start level=TRACE module=%s\n",
  152. manager->config->seg_module);
  153. rc = globus_module_activate(GLOBUS_SCHEDULER_EVENT_GENERATOR_MODULE);
  154. if (rc != GLOBUS_SUCCESS)
  155. {
  156. globus_gram_job_manager_log(
  157. manager,
  158. GLOBUS_GRAM_JOB_MANAGER_LOG_ERROR,
  159. "event=gram.seg.activate.end level=ERROR error=%d "
  160. "reason=\"Error activating SEG\"\n",
  161. rc);
  162. goto failed_activate;
  163. }
  164. globus_scheduler_event_generator_set_event_handler(
  165. globus_l_gram_seg_event_callback,
  166. manager);
  167. globus_scheduler_event_generator_set_timestamp(
  168. manager->seg_last_timestamp);
  169. setenv("JOB_MANAGER_SEG_SCHEDULER", manager->config->seg_module, 1);
  170. rc = globus_scheduler_event_generator_load_module(
  171. "job_manager");
  172. if (rc != GLOBUS_SUCCESS)
  173. {
  174. globus_gram_job_manager_log(
  175. manager,
  176. GLOBUS_GRAM_JOB_MANAGER_LOG_ERROR,
  177. "event=gram.seg.end level=ERROR "
  178. "error=%d "
  179. "module=%s "
  180. "reason=\"Error loading job_manager SEG "
  181. "module\"\n",
  182. rc,
  183. manager->config->seg_module);
  184. free(manager->config->seg_module);
  185. manager->config->seg_module = NULL;
  186. goto failed_load;
  187. }
  188. }
  189. manager->seg_started = GLOBUS_TRUE;
  190. failed_load:
  191. failed_activate:
  192. failed_periodic:
  193. GlobusGramJobManagerUnlock(manager);
  194. return result;
  195. }
  196. /* globus_gram_job_manager_init_seg() */
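/* --- Illustrative sketch (not part of globus_gram_job_manager_seg.c) ---
 * globus_gram_job_manager_init_seg() above falls back to a periodic
 * globus_callback when no SEG module is configured. This is a minimal
 * standalone sketch of that register/wait/unregister pattern, built only
 * from globus_common calls that also appear in this file plus the usual
 * mutex/cond idiom; poll_tick() and the "example" argument are made up
 * for illustration.
 */
#include "globus_common.h"
#include <stdio.h>

static globus_mutex_t                   tick_mutex;
static globus_cond_t                    tick_cond;
static int                              ticks = 0;

static
void
poll_tick(
    void *                              user_arg)
{
    globus_mutex_lock(&tick_mutex);
    ticks++;
    printf("tick %d (%s)\n", ticks, (const char *) user_arg);
    globus_cond_signal(&tick_cond);
    globus_mutex_unlock(&tick_mutex);
}

int
main(void)
{
    globus_callback_handle_t            handle = GLOBUS_NULL_HANDLE;
    globus_reltime_t                    period;
    globus_result_t                     result;

    globus_module_activate(GLOBUS_COMMON_MODULE);
    globus_mutex_init(&tick_mutex, NULL);
    globus_cond_init(&tick_cond, NULL);

    /* One-second period, the same delay the fork monitor above uses */
    GlobusTimeReltimeSet(period, 1, 0);
    result = globus_callback_register_periodic(
        &handle,
        &period,
        &period,
        poll_tick,
        "example");
    if (result != GLOBUS_SUCCESS)
    {
        fprintf(stderr, "register_periodic failed\n");
        return 1;
    }

    /* In the usual globus_common pattern, globus_cond_wait() also drives
     * callbacks in a non-threaded build, so waiting here lets poll_tick()
     * run a few times in either flavor of the library.
     */
    globus_mutex_lock(&tick_mutex);
    while (ticks < 3)
    {
        globus_cond_wait(&tick_cond, &tick_mutex);
    }
    globus_mutex_unlock(&tick_mutex);

    globus_callback_unregister(handle, NULL, NULL, NULL);
    globus_cond_destroy(&tick_cond);
    globus_mutex_destroy(&tick_mutex);
    globus_module_deactivate(GLOBUS_COMMON_MODULE);
    return 0;
}
/* --- end sketch --- */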
  197. globus_result_t
  198. globus_gram_job_manager_shutdown_seg(
  199. globus_gram_job_manager_t * manager)
  200. {
  201. if (! manager->seg_started)
  202. {
  203. return GLOBUS_SUCCESS;
  204. }
  205. if (manager->fork_callback_handle != GLOBUS_NULL_HANDLE)
  206. {
  207. globus_callback_unregister(
  208. manager->fork_callback_handle,
  209. NULL,
  210. NULL,
  211. NULL);
  212. manager->fork_callback_handle = GLOBUS_NULL_HANDLE;
  213. }
  214. else
  215. {
  216. globus_module_deactivate(GLOBUS_SCHEDULER_EVENT_GENERATOR_MODULE);
  217. }
  218. manager->seg_started = GLOBUS_FALSE;
  219. return GLOBUS_SUCCESS;
  220. }
  221. /* globus_gram_job_manager_shutdown_seg() */
  222. globus_result_t
  223. globus_l_gram_seg_event_callback(
  224. void * user_arg,
  225. const globus_scheduler_event_t * event)
  226. {
  227. int rc;
  228. globus_gram_job_manager_t * manager = user_arg;
  229. globus_gram_jobmanager_request_t * request;
  230. globus_result_t result = GLOBUS_SUCCESS;
  231. globus_scheduler_event_t * new_event;
  232. globus_gram_job_manager_log(
  233. manager,
  234. GLOBUS_GRAM_JOB_MANAGER_LOG_TRACE,
  235. "event=gram.seg.event.start level=TRACE segid=\"%s\" "
  236. "state=%d event_ts=%ld\n",
  237. event->job_id,
  238. (int) event->event_type,
  239. (long int) event->timestamp);
  240. if (event->event_type == GLOBUS_SCHEDULER_EVENT_RAW)
  241. {
  242. rc = GLOBUS_SUCCESS;
  243. goto raw_event;
  244. }
  245. result = globus_scheduler_event_copy(&new_event, event);
  246. if (result != GLOBUS_SUCCESS)
  247. {
  248. goto copy_failed;
  249. }
  250. GlobusGramJobManagerLock(manager);
  251. /* Find the job request associated with this job id */
  252. rc = globus_gram_job_manager_add_reference_by_jobid(
  253. manager,
  254. event->job_id,
  255. "SEG event",
  256. &request);
  257. if (rc != GLOBUS_SUCCESS)
  258. {
  259. if (manager->seg_pause_count > 0)
  260. {
  261. /* New submit script is running. Avoid race by adding this to the
  262. * manager-wide queue
  263. */
  264. globus_gram_job_manager_log(
  265. manager,
  266. GLOBUS_GRAM_JOB_MANAGER_LOG_TRACE,
  267. "event=gram.seg.event.queue level=TRACE segid=\"%s\"\n",
  268. event->job_id);
  269. rc = globus_fifo_enqueue(&manager->seg_event_queue, new_event);
  270. }
  271. else
  272. {
  273. globus_gram_job_manager_log(
  274. manager,
  275. GLOBUS_GRAM_JOB_MANAGER_LOG_TRACE,
  276. "event=gram.seg.event.end level=TRACE segid=\"%s\" "
  277. "reason=\"Event ID doesn't match known job id\"\n",
  278. event->job_id);
  279. }
  280. }
  281. if (rc != GLOBUS_SUCCESS)
  282. {
  283. GlobusGramJobManagerUnlock(manager);
  284. goto manager_event_queue_failed;
  285. }
  286. else if (request == NULL)
  287. {
  288. /* Ignore unwanted event */
  289. GlobusGramJobManagerUnlock(manager);
  290. goto done;
  291. }
  292. else
  293. {
  294. if (event->timestamp > manager->seg_last_timestamp)
  295. {
  296. manager->seg_last_timestamp = event->timestamp;
  297. }
  298. GlobusGramJobManagerUnlock(manager);
  299. rc = globus_l_gram_deliver_event(
  300. request,
  301. new_event);
  302. }
  303. if (rc != GLOBUS_SUCCESS)
  304. {
  305. (void) globus_gram_job_manager_remove_reference(
  306. request->manager,
  307. request->job_contact_path,
  308. "SEG event");
  309. manager_event_queue_failed:
  310. globus_scheduler_event_destroy(new_event);
  311. copy_failed:
  312. raw_event:
  313. ;
  314. }
  315. done:
  316. result = GLOBUS_SUCCESS;
  317. return result;
  318. }
  319. /* globus_l_gram_seg_event_callback() */
  320. void
  321. globus_gram_job_manager_seg_handle_event(
  322. globus_gram_jobmanager_request_t * request)
  323. {
  324. globus_scheduler_event_t * event;
  325. char * subjob_id_ptr = NULL;
  326. size_t subjob_id_len;
  327. globus_bool_t found_subjob_id;
  328. event = globus_fifo_dequeue(&request->seg_event_queue);
  329. if (event->timestamp > request->seg_last_timestamp)
  330. {
  331. /*
  332. * GRAM-145: GRAM5 Job Manager fails to save SEG timestamps in job
  333. * state files
  334. *
  335. * We'll update the SEG timestamp here so that if the job manager
  336. * is restarted it will (potentially) ignore events that have already been
  337. * noticed in the job state file.
  338. */
  339. request->seg_last_timestamp = event->timestamp;
  340. }
  341. globus_gram_job_manager_request_log(
  342. request,
  343. GLOBUS_GRAM_JOB_MANAGER_LOG_DEBUG,
  344. "event=gram.handle_seg_event.start "
  345. "level=DEBUG "
  346. "state=%d "
  347. "gramid=%s "
  348. "jobid=\"%s\" "
  349. "\n",
  350. event->event_type,
  351. request->job_contact_path,
  352. event->job_id);
  353. found_subjob_id = GLOBUS_FALSE;
  354. subjob_id_len = strlen(event->job_id);
  355. while (!found_subjob_id)
  356. {
  357. subjob_id_ptr = strstr(request->job_id_string, event->job_id);
  358. if (subjob_id_ptr == NULL)
  359. {
  360. break;
  361. }
  362. if (subjob_id_ptr == request->job_id_string ||
  363. (*(subjob_id_ptr - 1) == ','))
  364. {
  365. /* request->job_id_string starts with this subjob_id, or this
  366. * subjob_id happens after a comma. If it ends with a comma or
  367. * \0, then we've found a match.
  368. */
  369. if (subjob_id_ptr[subjob_id_len] == ',')
  370. {
  371. found_subjob_id = GLOBUS_TRUE;
  372. if (event->event_type == GLOBUS_SCHEDULER_EVENT_DONE ||
  373. event->event_type == GLOBUS_SCHEDULER_EVENT_FAILED)
  374. {
  375. /* Remove this sub job id from the list by moving
  376. * after the comma up until \0 to subjob_id_ptr
  377. */
  378. memmove(subjob_id_ptr,
  379. subjob_id_ptr + subjob_id_len + 1,
  380. strlen(subjob_id_ptr + subjob_id_len + 1) + 1);
  381. }
  382. }
  383. else if (subjob_id_ptr[subjob_id_len] == 0)
  384. {
  385. /* This is the final subjob in the job_id_string */
  386. found_subjob_id = GLOBUS_TRUE;
  387. if (event->event_type == GLOBUS_SCHEDULER_EVENT_DONE ||
  388. event->event_type == GLOBUS_SCHEDULER_EVENT_FAILED)
  389. {
  390. /* Don't need to do memmove here, just null terminate at
  391. * either the initial part of the string if subjob_id is
  392. * the only one in the list, or at the comma otherwise
  393. */
  394. if (subjob_id_ptr != request->job_id_string)
  395. {
  396. *(subjob_id_ptr - 1) = '\0';
  397. }
  398. else
  399. {
  400. request->job_id_string[0] = '\0';
  401. }
  402. }
  403. }
  404. }
  405. }
  406. /* If this is a terminal event (done or failed), we'll update the expected
  407. * terminal state (in the multi-subjob case) and the exit code
  408. * if the job's exit code is currently 0
  409. *
  410. * Thus, if any subjob fails or exits with a non-0 exit code, we will
  411. * propagate that in the job state change notification.
  412. */
  413. if (event->event_type == GLOBUS_SCHEDULER_EVENT_DONE ||
  414. event->event_type == GLOBUS_SCHEDULER_EVENT_FAILED)
  415. {
  416. if (request->expected_terminal_state ==
  417. GLOBUS_GRAM_PROTOCOL_JOB_STATE_DONE)
  418. {
  419. request->expected_terminal_state = event->event_type;
  420. }
  421. if (event->event_type == GLOBUS_SCHEDULER_EVENT_DONE &&
  422. request->exit_code == 0)
  423. {
  424. request->exit_code = event->exit_code;
  425. }
  426. }
  427. /* If the last job terminated or any job moved to active, we'll update the
  428. * job status and potentially send notifications.
  429. */
  430. if (event->event_type != GLOBUS_SCHEDULER_EVENT_DONE &&
  431. event->event_type != GLOBUS_SCHEDULER_EVENT_FAILED)
  432. {
  433. if (globus_i_gram_job_manager_script_valid_state_change(
  434. request,
  435. event->event_type))
  436. {
  437. globus_gram_job_manager_request_set_status(
  438. request,
  439. event->event_type);
  440. request->unsent_status_change = GLOBUS_TRUE;
  441. }
  442. }
  443. else if (*request->job_id_string == '\0')
  444. {
  445. if (globus_i_gram_job_manager_script_valid_state_change(
  446. request,
  447. request->expected_terminal_state))
  448. {
  449. if ((request->expected_terminal_state ==
  450. GLOBUS_GRAM_PROTOCOL_JOB_STATE_DONE) &&
  451. globus_gram_job_manager_rsl_need_stage_out(request))
  452. {
  453. globus_gram_job_manager_request_set_status(
  454. request,
  455. GLOBUS_GRAM_PROTOCOL_JOB_STATE_STAGE_OUT);
  456. }
  457. else
  458. {
  459. globus_gram_job_manager_request_set_status(
  460. request,
  461. request->expected_terminal_state);
  462. }
  463. request->unsent_status_change = GLOBUS_TRUE;
  464. }
  465. }
  466. globus_gram_job_manager_request_log(
  467. request,
  468. GLOBUS_GRAM_JOB_MANAGER_LOG_DEBUG,
  469. "event=gram.handle_seg_event.end "
  470. "level=DEBUG "
  471. "state=%d "
  472. "gramid=%s "
  473. "jobid=\"%s\" "
  474. "\n",
  475. event->event_type,
  476. request->job_contact_path,
  477. event->job_id);
  478. globus_scheduler_event_destroy(event);
  479. (void) globus_gram_job_manager_remove_reference(
  480. request->manager,
  481. request->job_contact_path,
  482. "SEG event");
  483. }
  484. /* globus_gram_job_manager_seg_handle_event() */
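/* --- Illustrative sketch (not part of globus_gram_job_manager_seg.c) ---
 * The handler above prunes a finished subjob id out of the comma-separated
 * request->job_id_string with strstr()/memmove(). The same idea in a tiny
 * standalone form; remove_subjob() and the sample id list are hypothetical.
 */
#include <stdio.h>
#include <string.h>

static
int
remove_subjob(
    char *                              job_ids,
    const char *                        subjob)
{
    size_t                              len = strlen(subjob);
    char *                              hit = job_ids;

    while ((hit = strstr(hit, subjob)) != NULL)
    {
        int at_boundary = (hit == job_ids) || (hit[-1] == ',');

        if (at_boundary && hit[len] == ',')
        {
            /* Leading or interior entry: shift the tail (and its '\0') left */
            memmove(hit, hit + len + 1, strlen(hit + len + 1) + 1);
            return 1;
        }
        else if (at_boundary && hit[len] == '\0')
        {
            /* Final (or only) entry: truncate at the preceding comma */
            if (hit != job_ids)
            {
                *(hit - 1) = '\0';
            }
            else
            {
                job_ids[0] = '\0';
            }
            return 1;
        }
        hit++;      /* partial match (e.g. "10" inside "101"): keep looking */
    }
    return 0;
}

int
main(void)
{
    char                                ids[] = "101,102,103";

    remove_subjob(ids, "102");
    printf("%s\n", ids);                /* prints "101,103" */
    return 0;
}
/* --- end sketch --- */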
  485. void
  486. globus_gram_job_manager_seg_pause(
  487. globus_gram_job_manager_t * manager)
  488. {
  489. globus_gram_job_manager_log(
  490. manager,
  491. GLOBUS_GRAM_JOB_MANAGER_LOG_TRACE,
  492. "event=gram.seg_pause.start "
  493. "level=TRACE "
  494. "count=%d "
  495. "\n",
  496. manager->seg_pause_count+1);
  497. GlobusGramJobManagerLock(manager);
  498. manager->seg_pause_count++;
  499. GlobusGramJobManagerUnlock(manager);
  500. globus_gram_job_manager_log(
  501. manager,
  502. GLOBUS_GRAM_JOB_MANAGER_LOG_TRACE,
  503. "event=gram.seg_pause.end "
  504. "level=TRACE "
  505. "count=%d "
  506. "\n",
  507. manager->seg_pause_count);
  508. }
  509. /* globus_gram_job_manager_seg_pause() */
  510. void
  511. globus_gram_job_manager_seg_resume(
  512. globus_gram_job_manager_t * manager)
  513. {
  514. globus_result_t result;
  515. globus_scheduler_event_t * event;
  516. globus_gram_seg_resume_t * resume;
  517. globus_gram_job_manager_log(
  518. manager,
  519. GLOBUS_GRAM_JOB_MANAGER_LOG_TRACE,
  520. "event=gram.seg_resume.start "
  521. "level=TRACE "
  522. "count=%d "
  523. "\n",
  524. manager->seg_pause_count-1);
  525. GlobusGramJobManagerLock(manager);
  526. manager->seg_pause_count--;
  527. if (manager->seg_pause_count == 0 &&
  528. !globus_fifo_empty(&manager->seg_event_queue))
  529. {
  530. resume = malloc(sizeof(globus_gram_seg_resume_t));
  531. if (resume != NULL)
  532. {
  533. globus_reltime_t delay;
  534. GlobusTimeReltimeSet(delay, 0, 0);
  535. globus_gram_job_manager_log(
  536. manager,
  537. GLOBUS_GRAM_JOB_MANAGER_LOG_TRACE,
  538. "event=gram.seg_resume.info "
  539. "level=TRACE "
  540. "message=\"%s\" "
  541. "event_count=%d "
  542. "\n",
  543. "Creating resume callback struct",
  544. globus_fifo_size(&manager->seg_event_queue));
  545. resume->manager = manager;
  546. resume->events = globus_fifo_convert_to_list(
  547. &manager->seg_event_queue);
  548. result = globus_callback_register_oneshot(
  549. NULL,
  550. &delay,
  551. globus_l_seg_resume_callback,
  552. resume);
  553. if (result != GLOBUS_SUCCESS)
  554. {
  555. while (!globus_list_empty(resume->events))
  556. {
  557. event = globus_list_remove(&resume->events, resume->events);
  558. globus_scheduler_event_destroy(event);
  559. }
  560. }
  561. }
  562. }
  563. GlobusGramJobManagerUnlock(manager);
  564. globus_gram_job_manager_log(
  565. manager,
  566. GLOBUS_GRAM_JOB_MANAGER_LOG_TRACE,
  567. "event=gram.seg_resume.end "
  568. "level=TRACE "
  569. "count=%d "
  570. "\n",
  571. manager->seg_pause_count);
  572. }
  573. /* globus_gram_job_manager_seg_resume() */
  574. static
  575. void
  576. globus_l_seg_resume_callback(
  577. void * user_arg)
  578. {
  579. globus_gram_seg_resume_t * resume = user_arg;
  580. globus_scheduler_event_t * event;
  581. globus_gram_jobmanager_request_t * request;
  582. int rc;
  583. globus_gram_job_manager_log(
  584. NULL,
  585. GLOBUS_GRAM_JOB_MANAGER_LOG_TRACE,
  586. "event=gram.seg.resume_callback.start "
  587. "level=TRACE "
  588. "\n");
  589. while (!globus_list_empty(resume->events))
  590. {
  591. event = globus_list_remove(&resume->events, resume->events);
  592. GlobusGramJobManagerLock(resume->manager);
  593. rc = globus_gram_job_manager_add_reference_by_jobid(
  594. resume->manager,
  595. event->job_id,
  596. "SEG event",
  597. &request);
  598. if (rc != GLOBUS_SUCCESS)
  599. {
  600. globus_gram_job_manager_log(
  601. NULL,
  602. GLOBUS_GRAM_JOB_MANAGER_LOG_TRACE,
  603. "event=gram.seg.resume_callback.end "
  604. "level=TRACE "
  605. "status=%d "
  606. "msg=\"%s\" "
  607. "\n",
  608. 0,
  609. "Ignoring unknown job id");
  610. GlobusGramJobManagerUnlock(resume->manager);
  611. globus_scheduler_event_destroy(event);
  612. }
  613. else
  614. {
  615. if (event->timestamp > request->manager->seg_last_timestamp)
  616. {
  617. request->manager->seg_last_timestamp = event->timestamp;
  618. }
  619. GlobusGramJobManagerUnlock(resume->manager);
  620. rc = globus_l_gram_deliver_event(
  621. request,
  622. event);
  623. globus_gram_job_manager_log(
  624. NULL,
  625. GLOBUS_GRAM_JOB_MANAGER_LOG_TRACE,
  626. "event=gram.seg.resume_callback.end "
  627. "level=TRACE "
  628. "status=%d "
  629. "msg=\"%s\" "
  630. "\n",
  631. -rc,
  632. "Delivered event");
  633. }
  634. }
  635. }
  636. /* globus_l_seg_resume_callback() */
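/* --- Illustrative sketch (not part of globus_gram_job_manager_seg.c) ---
 * seg_resume() converts the manager-wide event fifo into a globus_list_t,
 * and the oneshot callback above drains that list from the head. A
 * standalone sketch of the same drain pattern, using plain strings instead
 * of globus_scheduler_event_t; the names queued here are made up.
 */
#include "globus_common.h"
#include <stdio.h>

int
main(void)
{
    globus_fifo_t                       fifo;
    globus_list_t *                     list;
    char *                              names[] = { "first", "second", "third" };
    int                                 i;

    globus_module_activate(GLOBUS_COMMON_MODULE);
    globus_fifo_init(&fifo);
    for (i = 0; i < 3; i++)
    {
        globus_fifo_enqueue(&fifo, names[i]);
    }

    /* Empty the fifo into a list, then pop entries off the head */
    list = globus_fifo_convert_to_list(&fifo);
    while (!globus_list_empty(list))
    {
        char *                          entry;

        entry = globus_list_remove(&list, list);
        printf("dequeued %s\n", entry);
    }

    globus_fifo_destroy(&fifo);
    globus_module_deactivate(GLOBUS_COMMON_MODULE);
    return 0;
}
/* --- end sketch --- */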
  637. static
  638. int
  639. globus_l_gram_deliver_event(
  640. globus_gram_jobmanager_request_t * request,
  641. globus_scheduler_event_t * event)
  642. {
  643. int rc;
  644. globus_reltime_t delay_time;
  645. GlobusGramJobManagerRequestLock(request);
  646. globus_gram_job_manager_request_log(
  647. request,
  648. GLOBUS_GRAM_JOB_MANAGER_LOG_TRACE,
  649. "event=gram.seg_deliver_event.start "
  650. "level=TRACE "
  651. "gramid=%s "
  652. "jobid=\"%s\" "
  653. "state=%d "
  654. "jmstate=%s\n",
  655. request->job_contact_path,
  656. event->job_id,
  657. event->event_type,
  658. globus_i_gram_job_manager_state_strings[
  659. request->jobmanager_state]);
  660. /* Keep the state file's timestamp up to date so that
  661. * anything scrubbing the state files of old and dead
  662. * processes leaves it alone */
  663. if(request->job_state_file)
  664. {
  665. utime(request->job_state_file, NULL);
  666. }
  667. rc = globus_fifo_enqueue(&request->seg_event_queue, event);
  668. if (rc != GLOBUS_SUCCESS)
  669. {
  670. rc = GLOBUS_GRAM_PROTOCOL_ERROR_MALLOC_FAILED;
  671. globus_gram_job_manager_request_log(
  672. request,
  673. GLOBUS_GRAM_JOB_MANAGER_LOG_ERROR,
  674. "event=gram.seg_deliver_event.end "
  675. "level=ERROR "
  676. "gramid=%s "
  677. "jobid=\"%s\" "
  678. "state=%d "
  679. "jmstate=%s "
  680. "status=%d "
  681. "msg=\"%s\" "
  682. "reason=\"%s\" "
  683. "\n",
  684. request->job_contact_path,
  685. event->job_id,
  686. event->event_type,
  687. globus_i_gram_job_manager_state_strings[
  688. request->jobmanager_state],
  689. -rc,
  690. "Fifo enqueue failed",
  691. globus_gram_protocol_error_string(rc));
  692. goto event_enqueue_failed;
  693. }
  694. if (event->event_type == GLOBUS_SCHEDULER_EVENT_DONE ||
  695. event->event_type == GLOBUS_SCHEDULER_EVENT_FAILED)
  696. {
  697. (void) globus_gram_job_manager_unregister_job_id(
  698. request->manager,
  699. event->job_id);
  700. }
  701. if (request->jobmanager_state == GLOBUS_GRAM_JOB_MANAGER_STATE_POLL2)
  702. {
  703. GlobusTimeReltimeSet(delay_time, 0, 0);
  704. request->jobmanager_state = GLOBUS_GRAM_JOB_MANAGER_STATE_POLL1;
  705. rc = globus_gram_job_manager_state_machine_register(
  706. request->manager,
  707. request,
  708. &delay_time);
  709. if (rc != GLOBUS_SUCCESS)
  710. {
  711. request->jobmanager_state = GLOBUS_GRAM_JOB_MANAGER_STATE_POLL2;
  712. }
  713. }
  714. rc = GLOBUS_SUCCESS;
  715. globus_gram_job_manager_request_log(
  716. request,
  717. GLOBUS_GRAM_JOB_MANAGER_LOG_TRACE,
  718. "event=gram.seg_deliver_event.end "
  719. "level=TRACE "
  720. "gramid=%s "
  721. "jobid=\"%s\" "
  722. "state=%d "
  723. "jmstate=%s "
  724. "status=%d "
  725. "\n",
  726. request->job_contact_path,
  727. event->job_id,
  728. event->event_type,
  729. globus_i_gram_job_manager_state_strings[request->jobmanager_state],
  730. 0);
  731. event_enqueue_failed:
  732. GlobusGramJobManagerRequestUnlock(request);
  733. return rc;
  734. }
  735. /* globus_l_gram_deliver_event() */
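/* --- Illustrative sketch (not part of globus_gram_job_manager_seg.c) ---
 * globus_l_gram_deliver_event() calls utime(path, NULL) so the job state
 * file's timestamp stays current and scrubbers that clean up old state
 * files leave it alone. A minimal standalone "touch"; the default file
 * name is made up.
 */
#include <stdio.h>
#include <sys/types.h>
#include <utime.h>

int
main(
    int                                 argc,
    char *                              argv[])
{
    const char *                        path;

    path = (argc > 1) ? argv[1] : "job.state";
    /* A NULL times argument sets both atime and mtime to "now" */
    if (utime(path, NULL) != 0)
    {
        perror("utime");
        return 1;
    }
    printf("touched %s\n", path);
    return 0;
}
/* --- end sketch --- */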
  736. static
  737. void
  738. globus_l_gram_fork_poll_callback(
  739. void * user_arg)
  740. {
  741. int rc;
  742. globus_gram_job_manager_t * manager = user_arg;
  743. globus_list_t * l;
  744. globus_scheduler_event_t * event;
  745. globus_list_t * events = NULL;
  746. globus_gram_jobmanager_request_t * request;
  747. int pid_count = 0;
  748. int done_count = 0;
  749. globus_list_t * job_id_list;
  750. globus_list_t * tmp;
  751. /* Walk the job id list, checking to see if the process has completed */
  752. rc = globus_gram_job_manager_get_job_id_list(
  753. manager,
  754. &job_id_list);
  755. for (tmp = job_id_list; tmp != NULL; tmp = globus_list_rest(tmp))
  756. {
  757. char * tok_end = NULL;
  758. char * pid_string;
  759. char * job_id_string;
  760. char * job_id_string_copy;
  761. job_id_string = globus_list_first(tmp);
  762. job_id_string_copy = strdup(job_id_string);
  763. if (job_id_string_copy == NULL)
  764. {
  765. continue;
  766. }
  767. pid_count = 0;
  768. done_count = 0;
  769. for (tok_end = NULL,
  770. pid_string = strtok_r(job_id_string, ",", &tok_end);
  771. pid_string != NULL;
  772. pid_string = strtok_r(NULL, ",", &tok_end))
  773. {
  774. char * end = NULL;
  775. unsigned long pid;
  776. pid_count++;
  777. errno = 0;
  778. pid = strtoul(pid_string, &end, 10);
  779. if ((pid == ULONG_MAX && errno != 0) || strlen(end) != 0)
  780. {
  781. continue;
  782. }
  783. if (kill((pid_t) pid, 0) < 0)
  784. {
  785. done_count++;
  786. }
  787. }
  788. if (pid_count == done_count && pid_count > 0)
  789. {
  790. /* Synthesize done event */
  791. event = malloc(sizeof(globus_scheduler_event_t));
  792. event->event_type = GLOBUS_SCHEDULER_EVENT_DONE;
  793. event->job_id = job_id_string_copy;
  794. event->timestamp = time(NULL);
  795. event->exit_code = 0;
  796. event->failure_code = 0;
  797. event->raw_event = NULL;
  798. globus_list_insert(&events, event);
  799. }
  800. else
  801. {
  802. free(job_id_string_copy);
  803. }
  804. free(job_id_string);
  805. }
  806. globus_list_free(job_id_list);
  807. /* Queue events in the request-specific SEG event queue */
  808. for (l = events; l != NULL; l = globus_list_rest(l))
  809. {
  810. event = globus_list_first(l);
  811. GlobusGramJobManagerLock(manager);
  812. rc = globus_gram_job_manager_add_reference_by_jobid(
  813. manager,
  814. event->job_id,
  815. "SEG event",
  816. &request);
  817. GlobusGramJobManagerUnlock(manager);
  818. if (rc == GLOBUS_SUCCESS)
  819. {
  820. rc = globus_l_gram_deliver_event(
  821. request,
  822. event);
  823. if (rc != GLOBUS_SUCCESS)
  824. {
  825. goto destroy_event;
  826. }
  827. }
  828. if (rc != GLOBUS_SUCCESS)
  829. {
  830. destroy_event:
  831. globus_scheduler_event_destroy(event);
  832. }
  833. }
  834. globus_list_free(events);
  835. }
  836. /* globus_l_gram_fork_poll_callback() */
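/* --- Illustrative sketch (not part of globus_gram_job_manager_seg.c) ---
 * The fork monitor above decides a job has finished when every pid in its
 * comma-separated id list fails kill(pid, 0). The same check in a small
 * standalone form; count_exited() and the sample pid list are hypothetical.
 */
#include <errno.h>
#include <limits.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>

static
int
count_exited(
    const char *                        job_id_string,
    int *                               total)
{
    char *                              copy = strdup(job_id_string);
    char *                              save = NULL;
    char *                              tok;
    int                                 done = 0;

    *total = 0;
    if (copy == NULL)
    {
        return -1;
    }
    for (tok = strtok_r(copy, ",", &save);
         tok != NULL;
         tok = strtok_r(NULL, ",", &save))
    {
        char *                          end = NULL;
        unsigned long                   pid;

        (*total)++;
        errno = 0;
        pid = strtoul(tok, &end, 10);
        if ((pid == ULONG_MAX && errno != 0) || *end != '\0')
        {
            continue;
        }
        /* As in the poll callback, any kill() failure counts as "exited" */
        if (kill((pid_t) pid, 0) < 0)
        {
            done++;
        }
    }
    free(copy);
    return done;
}

int
main(void)
{
    int                                 total;
    int                                 done;

    done = count_exited("1,999999", &total);
    printf("%d of %d processes have exited\n", done, total);
    return 0;
}
/* --- end sketch --- */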
  837. /**
  838. * @brief
  839. * Condor SEG-like periodic callback
  840. *
  841. * @details
  842. * This function is called periodically to check for condor state changes by
  843. * polling the condor log files for the jobs. This code assumes that
  844. * - The condor log files can be located in $job_state_file_dir/condor.$uniq_id
  845. * - The condor log files are in (pseudo) XML format
  846. * - The condor log files are owned by the user whose job is being logged
  847. * - The condor log files are removed when the job is cleaned up
  848. *
  849. * This function uses this algorithm to process the logs:
  850. * - Note current poll timestamp, last poll timestamp
  851. * - For each file that matches the file pattern
  852. * -- Check ownership, if not owned by user, skip file
  853. * -- Check if modified since last poll timestamp, if not changed, skip file
  854. * -- Lock File
  855. * -- Parse log file to generate SEG events (see globus_l_condor_parse_log())
  856. * - set last poll timestamp to current poll timestamp
  857. */
  858. static
  859. void
  860. globus_l_gram_condor_poll_callback(
  861. void * user_arg)
  862. {
  863. int rc;
  864. time_t last_poll_time;
  865. time_t poll_time;
  866. double poll_length;
  867. globus_reltime_t delay;
  868. globus_gram_job_manager_t * manager = user_arg;
  869. globus_scheduler_event_t * event;
  870. globus_fifo_t events;
  871. char * condor_log_data;
  872. globus_gram_job_manager_ref_t * ref;
  873. uint64_t uniq1, uniq2;
  874. char * path = NULL;
  875. GlobusGramJobManagerLock(manager);
  876. poll_time = time(NULL);
  877. last_poll_time = manager->seg_last_timestamp;
  878. globus_gram_job_manager_log(
  879. manager,
  880. GLOBUS_GRAM_JOB_MANAGER_LOG_TRACE,
  881. "event=gram.condor_poll.start "
  882. "level=TRACE "
  883. "poll_time=%d "
  884. "last_poll=%d "
  885. "\n",
  886. poll_time,
  887. last_poll_time);
  888. rc = globus_fifo_init(&events);
  889. if (rc != GLOBUS_SUCCESS)
  890. {
  891. poll_time = last_poll_time;
  892. }
  893. for (ref = globus_hashtable_first(&manager->request_hash);
  894. ref != NULL;
  895. ref = globus_hashtable_next(&manager->request_hash))
  896. {
  897. if (ref->request &&
  898. ref->request->job_id_string &&
  899. *ref->request->job_id_string == 0)
  900. {
  901. /* Skip jobs which have no outstanding subjobs to poll */
  902. continue;
  903. }
  904. rc = sscanf(ref->key, "/%" SCNu64 "/%" SCNu64 "/", &uniq1, &uniq2);
  905. if (rc != 2)
  906. {
  907. globus_gram_job_manager_log(
  908. manager,
  909. GLOBUS_GRAM_JOB_MANAGER_LOG_WARN,
  910. "event=gram.condor_poll.info "
  911. "level=WARN "
  912. "msg=\"%s\" "
  913. "key=\"%s\" "
  914. "\n",
  915. "Unexpected key format",
  916. ref->key);
  917. continue;
  918. }
  919. path = globus_common_create_string("%s/condor.%"PRIu64".%"PRIu64,
  920. manager->config->job_state_file_dir,
  921. uniq1, uniq2);
  922. if (path == NULL)
  923. {
  924. continue;
  925. }
  926. rc = globus_l_condor_read_log(
  927. manager,
  928. path,
  929. ref->seg_last_size,
  930. &condor_log_data);
  931. /* condor_log_data is NULL if the file hasn't grown past
  932. * seg_last_size or an error occurred.
  933. */
  934. if (rc != GLOBUS_SUCCESS || condor_log_data == NULL)
  935. {
  936. goto read_failed;
  937. }
  938. rc = globus_l_condor_parse_log(
  939. condor_log_data,
  940. ref,
  941. &events);
  942. free(condor_log_data);
  943. read_failed:
  944. free(path);
  945. path = NULL;
  946. }
  947. /*
  948. * Adjust poll interval based on polling time. If things are going slowly,
  949. * wait for a multiple of the poll time, otherwise reset the clock to
  950. * 5 seconds to avoid globus_callback scheduling this to run fewer than 5
  951. * seconds from now.
  952. */
  953. poll_length = difftime(time(NULL), poll_time);
  954. if (poll_length > 1.0)
  955. {
  956. GlobusTimeReltimeSet(delay, (time_t) (poll_length * 5), 0);
  957. }
  958. else
  959. {
  960. GlobusTimeReltimeSet(delay, (time_t) 5, 0);
  961. }
  962. globus_callback_adjust_period(manager->fork_callback_handle, &delay);
  963. GlobusGramJobManagerUnlock(manager);
  964. while (!globus_fifo_empty(&events))
  965. {
  966. event = globus_fifo_dequeue(&events);
  967. globus_l_gram_seg_event_callback(manager, event);
  968. globus_scheduler_event_destroy(event);
  969. }
  970. globus_fifo_destroy(&events);
  971. GlobusGramJobManagerLock(manager);
  972. if (poll_time > manager->seg_last_timestamp)
  973. {
  974. manager->seg_last_timestamp = poll_time;
  975. }
  976. GlobusGramJobManagerUnlock(manager);
  977. globus_gram_job_manager_log(
  978. manager,
  979. GLOBUS_GRAM_JOB_MANAGER_LOG_TRACE,
  980. "event=gram.condor_poll.end "
  981. "level=TRACE "
  982. "\n");
  983. }
  984. /* globus_l_gram_condor_poll_callback() */
  985. /**
  986. * @brief Generate SEG events for condor log events in a data buffer
  987. *
  988. * @details
  989. * This function uses a couple of regular expressions to pull out the
  990. * data from a (pseudo)XML condor log. This parser is adapted from the
  991. * condor SEG implementation from GT4. The log messages look something like this:
  992. * &lt;c>
  993. * &lt;<a n="ATTRIBUTE-NAME">&lt;b v="t|f"/>|&lt;s>STRING&lt;/s>|&lt;i>INTEGER&lt;/i>|&lt;r>REAL&lt;/r>
  994. * &lt;/c>
  995. *
  996. * We are only interested in attributes directly related to SEG events:
  997. * - EventTypeNumber
  998. * - EventTime
  999. * - Cluster
  1000. * - Proc
  1001. * - Subproc
  1002. * - TerminatedNormally
  1003. * - ReturnValue
  1004. *
  1005. * The parser pulls out values for all of the children of a c element, then
  1006. * creates an event from it and pushes it onto the events fifo.
  1007. */
  1008. static
  1009. int
  1010. globus_l_condor_parse_log(
  1011. const char * data,
  1012. globus_gram_job_manager_ref_t * ref,
  1013. globus_fifo_t * events)
  1014. {
  1015. static int once = 0;
  1016. static regex_t outer_re, inner_re;
  1017. regmatch_t matches[8];
  1018. const char * p;
  1019. int event_type_number;
  1020. const char * event_time;
  1021. int cluster;
  1022. int proc;
  1023. int subproc;
  1024. globus_bool_t terminated_normally;
  1025. int return_value = 0;
  1026. struct tm event_tm;
  1027. time_t event_stamp;
  1028. int rc;
  1029. globus_off_t parsed_length = 0;
  1030. globus_scheduler_event_t * event;
  1031. enum condor_attr_e
  1032. {
  1033. DONTCARE,
  1034. EVENT_TYPE_NUMBER,
  1035. EVENT_TIME,
  1036. CLUSTER,
  1037. PROC,
  1038. SUBPROC,
  1039. TERMINATED_NORMALLY,
  1040. RETURN_VALUE
  1041. } condor_attr;
  1042. typedef enum
  1043. {
  1044. CONDOR_STRING,
  1045. CONDOR_INTEGER,
  1046. CONDOR_BOOLEAN,
  1047. CONDOR_REAL
  1048. } condor_parse_type_t;
  1049. union
  1050. {
  1051. condor_parse_type_t type;
  1052. struct
  1053. {
  1054. condor_parse_type_t type;
  1055. const char * s;
  1056. size_t len;
  1057. } s;
  1058. struct
  1059. {
  1060. condor_parse_type_t type;
  1061. int i;
  1062. } i;
  1063. struct
  1064. {
  1065. condor_parse_type_t type;
  1066. globus_bool_t b;
  1067. } b;
  1068. struct
  1069. {
  1070. condor_parse_type_t type;
  1071. float r;
  1072. } r;
  1073. } pu;
  1074. if (!once)
  1075. {
  1076. once = 1;
  1077. rc = regcomp(&outer_re,
  1078. "(<c>((<[^/]|</[^c]>|[^<])*)</c>)",
  1079. REG_EXTENDED);
  1080. assert (rc == 0);
  1081. rc = regcomp(&inner_re,
  1082. "^([[:space:]]*"
  1083. "<a n=\"([[:alpha:]]+)\">[[:space:]]*"
  1084. "(<(b) v=\"([tf])\"/>|<([sir])>([^<]*)</[sir]>)"
  1085. "</a>[[:space:]]*)",
  1086. REG_EXTENDED);
  1087. assert(rc == 0);
  1088. }
  1089. p = data + ref->seg_last_size;
  1090. parsed_length = ref->seg_last_size;
  1091. while ((rc = regexec(
  1092. &outer_re, p, (int) (sizeof(matches)/sizeof(matches[0])),
  1093. matches, 0)) == 0)
  1094. {
  1095. const char * e = p + matches[1].rm_eo;
  1096. regoff_t event_length = matches[0].rm_eo - matches[0].rm_so;
  1097. p = p + matches[2].rm_so;
  1098. while ((rc = regexec(&inner_re, p,
  1099. (int) (sizeof(matches)/sizeof(matches[0])),
  1100. matches, 0)) == 0)
  1101. {
  1102. size_t matchlen;
  1103. const char * match;
  1104. /* Regular expression match indices as xpath strings
  1105. * 1: a
  1106. * 2: a/@n
  1107. * 3: a/b|a/s/|a/i|a/r
  1108. * 4: a/b
  1109. * 5: a/b/@v
  1110. * 6: a/s/local-name()|a/i/local-name()|a/r/local-name()
  1111. * 7: a/s/text()|a/i/text()|a/r/text()
  1112. */
  1113. matchlen = (size_t) (matches[2].rm_eo - matches[2].rm_so);
  1114. match = p + matches[2].rm_so;
  1115. if (strncmp(match, "EventTypeNumber", matchlen) == 0)
  1116. {
  1117. condor_attr = EVENT_TYPE_NUMBER;
  1118. }
  1119. else if (strncmp(match, "EventTime", matchlen) == 0)
  1120. {
  1121. condor_attr = EVENT_TIME;
  1122. }
  1123. else if (strncmp(match, "Cluster", matchlen) == 0)
  1124. {
  1125. condor_attr = CLUSTER;
  1126. }
  1127. else if (strncmp(match, "Proc", matchlen) == 0)
  1128. {
  1129. condor_attr = PROC;
  1130. }
  1131. else if (strncmp(match, "Subproc", matchlen) == 0)
  1132. {
  1133. condor_attr = SUBPROC;
  1134. }
  1135. else if (strncmp(match, "TerminatedNormally", matchlen) == 0)
  1136. {
  1137. condor_attr = TERMINATED_NORMALLY;
  1138. }
  1139. else if (strncmp(match, "ReturnValue", matchlen) == 0)
  1140. {
  1141. condor_attr = RETURN_VALUE;
  1142. }
  1143. else
  1144. {
  1145. condor_attr = DONTCARE;
  1146. }
  1147. matchlen = (size_t) (matches[4].rm_eo - matches[4].rm_so);
  1148. match = p + matches[4].rm_so;
  1149. if (matches[4].rm_so != -1)
  1150. {
  1151. if (strncmp(match, "b", matchlen) == 0)
  1152. {
  1153. pu.type = CONDOR_BOOLEAN;
  1154. matchlen = (size_t) (matches[5].rm_eo - matches[5].rm_so);
  1155. match = p + matches[5].rm_so;
  1156. if (strncmp(match, "t", matchlen) == 0)
  1157. {
  1158. pu.b.b = GLOBUS_TRUE;
  1159. }
  1160. else
  1161. {
  1162. pu.b.b = GLOBUS_FALSE;
  1163. }
  1164. }
  1165. }
  1166. matchlen = (size_t) (matches[6].rm_eo - matches[6].rm_so);
  1167. match = p + matches[6].rm_so;
  1168. if (matches[6].rm_so != -1)
  1169. {
  1170. if (strncmp(match, "s", matchlen) == 0)
  1171. {
  1172. pu.type = CONDOR_STRING;
  1173. pu.s.s = p + matches[7].rm_so;
  1174. pu.s.len = (size_t) (matches[7].rm_eo - matches[7].rm_so);
  1175. }
  1176. else if (strncmp(match, "i", matchlen) == 0)
  1177. {
  1178. pu.type = CONDOR_INTEGER;
  1179. pu.i.i = atoi(p + matches[7].rm_so);
  1180. }
  1181. else if (strncmp(match, "r", matchlen) == 0)
  1182. {
  1183. pu.type = CONDOR_REAL;
  1184. sscanf(p + matches[7].rm_so, "%f", &pu.r.r);
  1185. }
  1186. }
  1187. switch (condor_attr)
  1188. {
  1189. case EVENT_TYPE_NUMBER:
  1190. globus_assert (pu.type == CONDOR_INTEGER);
  1191. event_type_number = pu.i.i;
  1192. break;
  1193. case EVENT_TIME:
  1194. globus_assert (pu.type == CONDOR_STRING);
  1195. event_time = pu.s.s;
  1196. globus_strptime(
  1197. (char *) event_time,
  1198. "%Y-%m-%dT%H:%M:%S",
  1199. &event_tm);
  1200. event_stamp = mktime(&event_tm);
  1201. break;
  1202. case CLUSTER:
  1203. globus_assert (pu.type == CONDOR_INTEGER);
  1204. cluster = pu.i.i;
  1205. break;
  1206. case PROC:
  1207. globus_assert (pu.type == CONDOR_INTEGER);
  1208. proc = pu.i.i;
  1209. break;
  1210. case SUBPROC:
  1211. globus_assert (pu.type == CONDOR_INTEGER);
  1212. subproc = pu.i.i;
  1213. break;
  1214. case TERMINATED_NORMALLY:
  1215. globus_assert (pu.type == CONDOR_BOOLEAN);
  1216. terminated_normally = pu.b.b;
  1217. break;
  1218. case RETURN_VALUE:
  1219. globus_assert (pu.type == CONDOR_INTEGER);
  1220. return_value = pu.i.i;
  1221. break;
  1222. case DONTCARE:
  1223. default:
  1224. break;
  1225. }
  1226. p = p + matches[1].rm_eo;
  1227. }
  1228. p = e;
  1229. parsed_length += event_length;
  1230. event = NULL;
  1231. switch (event_type_number)
  1232. {
  1233. case 0: /* SubmitEvent */
  1234. event = calloc(1, sizeof(globus_scheduler_event_t));
  1235. event->event_type = GLOBUS_SCHEDULER_EVENT_PENDING;
  1236. event->job_id = globus_common_create_string("%03d.%03d.%03d",
  1237. cluster, proc, subproc);
  1238. event->timestamp = event_stamp;
  1239. globus_fifo_enqueue(events, event);
  1240. break;
  1241. case 1: /* ExecuteEvent */
  1242. event = calloc(1, sizeof(globus_scheduler_event_t));
  1243. event->event_type = GLOBUS_SCHEDULER_EVENT_ACTIVE;
  1244. event->job_id = globus_common_create_string("%03d.%03d.%03d",
  1245. cluster, proc, subproc);
  1246. event->timestamp = event_stamp;
  1247. globus_fifo_enqueue(events, event);
  1248. break;
  1249. case 5: /* JobTerminatedEvent */
  1250. if (terminated_normally)
  1251. {
  1252. event = calloc(1, sizeof(globus_scheduler_event_t));
  1253. event->event_type = GLOBUS_SCHEDULER_EVENT_DONE;
  1254. event->job_id = globus_common_create_string("%03d.%03d.%03d",
  1255. cluster, proc, subproc);
  1256. event->timestamp = event_stamp;
  1257. event->exit_code = return_value;
  1258. globus_fifo_enqueue(events, event);
  1259. }
  1260. else
  1261. {
  1262. case 9: /* JobAbortedEvent */
  1263. event = calloc(1, sizeof(globus_scheduler_event_t));
  1264. event->event_type = GLOBUS_SCHEDULER_EVENT_FAILED;
  1265. event->job_id = globus_common_create_string("%03d.%03d.%03d",
  1266. cluster, proc, subproc);
  1267. event->timestamp = event_stamp;
  1268. event->failure_code = return_value;
  1269. globus_fifo_enqueue(events, event);
  1270. }
  1271. break;
  1272. }
  1273. if (event && event->timestamp > ref->seg_last_timestamp)
  1274. {
  1275. ref->seg_last_timestamp = event->timestamp;
  1276. }
  1277. }
  1278. ref->seg_last_size = parsed_length;
  1279. return 0;
  1280. }
  1281. /* globus_l_condor_parse_log() */
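/* --- Illustrative sketch (not part of globus_gram_job_manager_seg.c) ---
 * The parser above walks each <c>...</c> element with one POSIX regex and
 * pulls the <a n="...">value</a> children out with a second one. A
 * condensed standalone version of that inner loop; the sample log fragment
 * is hand-written in the pseudo-XML shape described in the comment above,
 * not captured from condor.
 */
#include <regex.h>
#include <stdio.h>

int
main(void)
{
    const char *                        log =
        "<c>"
        "<a n=\"EventTypeNumber\"><i>5</i></a>"
        "<a n=\"TerminatedNormally\"><b v=\"t\"/></a>"
        "<a n=\"ReturnValue\"><i>0</i></a>"
        "</c>";
    regex_t                             attr_re;
    regmatch_t                          m[5];
    const char *                        p = log;
    int                                 rc;

    /* Group 1: attribute name, group 3: boolean value, group 4: s/i/r text */
    rc = regcomp(
        &attr_re,
        "<a n=\"([[:alpha:]]+)\">(<b v=\"([tf])\"/>|<[sir]>([^<]*)</[sir]>)</a>",
        REG_EXTENDED);
    if (rc != 0)
    {
        return 1;
    }
    while (regexec(&attr_re, p, 5, m, 0) == 0)
    {
        printf("%.*s = ", (int) (m[1].rm_eo - m[1].rm_so), p + m[1].rm_so);
        if (m[3].rm_so != -1)
        {
            printf("%.*s (boolean)\n",
                (int) (m[3].rm_eo - m[3].rm_so), p + m[3].rm_so);
        }
        else
        {
            printf("%.*s\n",
                (int) (m[4].rm_eo - m[4].rm_so), p + m[4].rm_so);
        }
        p += m[0].rm_eo;
    }
    regfree(&attr_re);
    return 0;
}
/* --- end sketch --- */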
  1282. static
  1283. int
  1284. globus_l_condor_read_log(
  1285. globus_gram_job_manager_t *manager,
  1286. const char *path,
  1287. size_t last_size,
  1288. char **data)
  1289. {
  1290. int condor_log_fd;
  1291. char *condor_log_data;
  1292. struct stat st;
  1293. struct flock flock_data;
  1294. int rc = GLOBUS_SUCCESS;
  1295. *data = NULL;
  1296. condor_log_fd = open(path, O_RDONLY);
  1297. if (condor_log_fd < 0)
  1298. {
  1299. globus_gram_job_manager_log(
  1300. manager,
  1301. GLOBUS_GRAM_JOB_MANAGER_LOG_TRACE,
  1302. "event=gram.condor_poll.info "
  1303. "level=TRACE "
  1304. "message=\"%s\" "
  1305. "errno=%d "
  1306. "errstr=\"%s\" "
  1307. "\n",
  1308. "open failed",
  1309. errno,
  1310. strerror(errno));
  1311. rc = GLOBUS_GRAM_PROTOCOL_ERROR_NO_STATE_FILE;
  1312. goto open_failed;
  1313. }
  1314. rc = fstat(condor_log_fd, &st);
  1315. if (rc != GLOBUS_SUCCESS)
  1316. {
  1317. globus_gram_job_manager_log(
  1318. manager,
  1319. GLOBUS_GRAM_JOB_MANAGER_LOG_TRACE,
  1320. "event=gram.condor_poll.info "
  1321. "level=TRACE "
  1322. "message=\"%s\" "
  1323. "errno=%d "
  1324. "errstr=\"%s\" "
  1325. "\n",
  1326. "fstat failed",
  1327. errno,
  1328. strerror(errno));
  1329. rc = GLOBUS_GRAM_PROTOCOL_ERROR_READING_STATE_FILE;
  1330. goto fstat_failed;
  1331. }
  1332. if (st.st_uid != getuid())
  1333. {
  1334. globus_gram_job_manager_log(
  1335. manager,
  1336. GLOBUS_GRAM_JOB_MANAGER_LOG_TRACE,
  1337. "event=gram.condor_poll.info "
  1338. "level=TRACE "
  1339. "message=\"%s\" "
  1340. "uid.me=%ld "
  1341. "uid.file=%ld "
  1342. "\n",
  1343. "uid mismatch",
  1344. (long) getuid(),
  1345. (long) st.st_uid);
  1346. rc = GLOBUS_GRAM_PROTOCOL_ERROR_READING_STATE_FILE;
  1347. goto uid_mismatch;
  1348. }
  1349. if (st.st_size <= last_size)
  1350. {
  1351. globus_gram_job_manager_log(
  1352. manager,
  1353. GLOBUS_GRAM_JOB_MANAGER_LOG_TRACE,
  1354. "event=gram.condor_poll.info "
  1355. "level=TRACE "
  1356. "message=\"%s\" "
  1357. "file=\"%s\" "
  1358. "size.last_poll=%lld "
  1359. "size.file=%lld "
  1360. "\n",
  1361. "file hasn't grown since last poll",
  1362. path,
  1363. (long long) last_size,
  1364. (long long) st.st_size);
  1365. goto not_grown;
  1366. }
  1367. flock_data.l_type = F_RDLCK;
  1368. flock_data.l_whence = SEEK_SET;
  1369. flock_data.l_start = 0;
  1370. flock_data.l_len = 0;
  1371. flock_data.l_pid = getpid();
  1372. globus_gram_job_manager_log(
  1373. manager,
  1374. GLOBUS_GRAM_JOB_MANAGER_LOG_TRACE,
  1375. "event=gram.condor_poll.info "
  1376. "level=TRACE "
  1377. "message=\"%s\" "
  1378. "file=\"%s\" "
  1379. "\n",
  1380. "Checking file for new events",
  1381. path);
  1382. do
  1383. {
  1384. rc = fcntl(condor_log_fd, F_SETLKW, &flock_data);
  1385. if (rc != 0 && errno != EINTR)
  1386. {
  1387. goto fcntl_lock_failed;
  1388. }
  1389. } while (rc == -1);
  1390. {
  1391. ssize_t read_res;
  1392. size_t amt_to_read = st.st_size;
  1393. size_t amt_read = 0;
  1394. condor_log_data = malloc((size_t) st.st_size + 1);
  1395. if (condor_log_data == NULL)
  1396. {
  1397. globus_gram_job_manager_log(
  1398. manager,
  1399. GLOBUS_GRAM_JOB_MANAGER_LOG_WARN,
  1400. "event=gram.condor_poll.info "
  1401. "level=WARN "
  1402. "message=\"%s\" "
  1403. "filename=\"%s\" "
  1404. "size=%llu "
  1405. "errno=%d "
  1406. "reason=%s\n",
  1407. "Error allocating memory for condor log",
  1408. path,
  1409. (unsigned long long) st.st_size,
  1410. errno,
  1411. strerror(errno));
  1412. rc = GLOBUS_GRAM_PROTOCOL_ERROR_MALLOC_FAILED;
  1413. goto malloc_data_failed;
  1414. }
  1415. condor_log_data[(size_t) st.st_size] = 0;
  1416. while (amt_to_read > amt_read)
  1417. {
  1418. read_res = read(
  1419. condor_log_fd,
  1420. condor_log_data + amt_read,
  1421. amt_to_read - amt_read);
  1422. if (read_res < 0 && errno == EINTR)
  1423. {
  1424. continue;
  1425. }
  1426. else if (read_res > 0)
  1427. {
  1428. amt_read += read_res;
  1429. }
  1430. else
  1431. {
  1432. /* Some other error or short read */
  1433. break;
  1434. }
  1435. }
  1436. if (amt_to_read != amt_read)
  1437. {
  1438. globus_gram_job_manager_log(
  1439. manager,
  1440. GLOBUS_GRAM_JOB_MANAGER_LOG_WARN,
  1441. "event=gram.condor_poll.info "
  1442. "level=WARN "
  1443. "message=\"%s\" "
  1444. "filename=\"%s\" "
  1445. "size=%llu "
  1446. "amt_read=%llu "
  1447. "errno=%d "
  1448. "reason=%s\n",
  1449. "Error reading condor log",

(Listing truncated here; the remainder of the file is not shown.)