PageRenderTime 57ms CodeModel.GetById 20ms RepoModel.GetById 1ms app.codeStats 0ms

/src/condor_dagman/dagman_main.cpp

https://github.com/clalancette/condor-dcloud
C++ | 1312 lines | 931 code | 173 blank | 208 comment | 221 complexity | 5ddf7bd49486119cddc7c1b278cebdba MD5 | raw file
Possible License(s): Apache-2.0
  1. /***************************************************************
  2. *
  3. * Copyright (C) 1990-2007, Condor Team, Computer Sciences Department,
  4. * University of Wisconsin-Madison, WI.
  5. *
  6. * Licensed under the Apache License, Version 2.0 (the "License"); you
  7. * may not use this file except in compliance with the License. You may
  8. * obtain a copy of the License at
  9. *
  10. * http://www.apache.org/licenses/LICENSE-2.0
  11. *
  12. * Unless required by applicable law or agreed to in writing, software
  13. * distributed under the License is distributed on an "AS IS" BASIS,
  14. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. * See the License for the specific language governing permissions and
  16. * limitations under the License.
  17. *
  18. ***************************************************************/
  19. #include "condor_common.h"
  20. #include "condor_config.h"
  21. #include "condor_daemon_core.h"
  22. #include "condor_string.h"
  23. #include "subsystem_info.h"
  24. #include "basename.h"
  25. #include "setenv.h"
  26. #include "dag.h"
  27. #include "debug.h"
  28. #include "parse.h"
  29. #include "my_username.h"
  30. #include "condor_environ.h"
  31. #include "dagman_main.h"
  32. #include "dagman_commands.h"
  33. #include "dagman_multi_dag.h"
  34. #include "util.h"
  35. #include "condor_getcwd.h"
  36. #include "condor_version.h"
  37. #include "subsystem_info.h"
  38. void ExitSuccess(); // Forward declaration; the definition appears later in this file.
  39. // From condor_utils/condor_config.C
  40. // Note: these functions are declared 'extern "C"' where they're
  41. // implemented; if we don't do that here we get a link failure
  42. // (I think because of the name mangling). wenger 2007-02-09.
  43. extern "C" void process_config_source( char* file, char* name,
  44. char* host, int required );
  45. extern "C" bool is_piped_command(const char* filename);
  46. //---------------------------------------------------------------------------
  47. DECL_SUBSYSTEM( "DAGMAN", SUBSYSTEM_TYPE_DAGMAN );
  48. static char* lockFileName = NULL; // Lock file path; set from the -Lockfile argument in main_init() and unlink()ed by the exit paths below.
  49. static Dagman dagman; // The single Dagman instance used by the main_* entry points in this file.
  50. //---------------------------------------------------------------------------
  51. static void Usage() {
  52. debug_printf( DEBUG_SILENT, "\nUsage: condor_dagman -f -t -l .\n"
  53. "\t\t-Lockfile <NAME.dag.lock>\n"
  54. "\t\t-Dag <NAME.dag>\n"
  55. "\t\t-CsdVersion <version string>\n"
  56. "\t\t[-Debug <level>]\n"
  57. "\t\t[-Rescue <Rescue.dag>]\n"
  58. "\t\t[-MaxIdle <int N>]\n"
  59. "\t\t[-MaxJobs <int N>]\n"
  60. "\t\t[-MaxPre <int N>]\n"
  61. "\t\t[-MaxPost <int N>]\n"
  62. "\t\t[-WaitForDebug]\n"
  63. "\t\t[-NoEventChecks]\n"
  64. "\t\t[-AllowLogError]\n"
  65. "\t\t[-UseDagDir]\n"
  66. "\t\t[-AutoRescue <0|1>]\n"
  67. "\t\t[-DoRescueFrom <int N>]\n"
  68. "\t\t[-AllowVersionMismatch]\n"
  69. "\t\t[-DumpRescue]\n"
  70. "\t\t[-Verbose]\n"
  71. "\t\t[-Force]\n"
  72. "\t\t[-Notification <never|always|complete|error>]\n"
  73. "\t\t[-Dagman <dagman_executable>]\n"
  74. "\t\t[-Outfile_dir <directory>]\n"
  75. "\t\t[-Update_submit]\n"
  76. "\t\t[-Import_env]\n"
  77. "\twhere NAME is the name of your DAG.\n"
  78. "\tdefault -Debug is -Debug %d\n", DEBUG_NORMAL);
  79. DC_Exit( EXIT_ERROR );
  80. }
  81. //---------------------------------------------------------------------------
  82. Dagman::Dagman() :
  83. dag (NULL),
  84. maxIdle (0),
  85. maxJobs (0),
  86. maxPreScripts (0),
  87. maxPostScripts (0),
  88. rescueFileToWrite (NULL),
  89. paused (false),
  90. condorSubmitExe (NULL),
  91. condorRmExe (NULL),
  92. storkSubmitExe (NULL),
  93. storkRmExe (NULL),
  94. submit_delay (0),
  95. max_submit_attempts (6),
  96. max_submits_per_interval (5), // so Coverity is happy
  97. m_user_log_scan_interval (5),
  98. primaryDagFile (""),
  99. multiDags (false),
  100. startup_cycle_detect (false), // so Coverity is happy
  101. allowLogError (false),
  102. useDagDir (false),
  103. allow_events (CheckEvents::ALLOW_NONE), // so Coverity is happy
  104. retrySubmitFirst (true), // so Coverity is happy
  105. retryNodeFirst (false), // so Coverity is happy
  106. mungeNodeNames (true), // so Coverity is happy
  107. prohibitMultiJobs (false), // so Coverity is happy
  108. abortDuplicates (true), // so Coverity is happy
  109. submitDepthFirst (false), // so Coverity is happy
  110. abortOnScarySubmit (true), // so Coverity is happy
  111. pendingReportInterval (10 * 60), // 10 minutes
  112. _dagmanConfigFile (NULL), // so Coverity is happy
  113. autoRescue(true),
  114. doRescueFrom(0),
  115. maxRescueDagNum(MAX_RESCUE_DAG_DEFAULT),
  116. rescueFileToRun(""),
  117. dumpRescueDag(false),
  118. _defaultNodeLog(NULL),
  119. _generateSubdagSubmits(true),
  120. _maxJobHolds(100)
  121. {
  122. debug_level = DEBUG_VERBOSE; // Default debug level is verbose output
  123. }
  124. Dagman::~Dagman()
  125. {
  126. // check if dag is NULL, since we may have
  127. // already delete'd it in the dag.CleanUp() method.
  128. if ( dag != NULL ) {
  129. delete dag;
  130. dag = NULL;
  131. }
  132. }
  133. //
  134. // In Config() we get DAGMan-related configuration values. This
  135. // is a three-step process:
  136. // 1. Get the name of the DAGMan-specific config file (if any).
  137. // 2. If there is a DAGMan-specific config file, process it so
  138. // that its values are added to the configuration.
  139. // 3. Get the values we want from the configuration.
  140. //
  141. bool
  142. Dagman::Config()
  143. {
  144. int debug_cache_size = (1024*1024)*5; // 5 MB
  145. bool debug_cache_enabled = false;
  146. // Note: debug_printfs are DEBUG_NORMAL here because when we
  147. // get here we haven't processed command-line arguments yet.
  148. // Get and process the DAGMan-specific config file (if any)
  149. // before getting any of the other parameters.
  150. _dagmanConfigFile = param( "DAGMAN_CONFIG_FILE" );
  151. if ( _dagmanConfigFile ) {
  152. debug_printf( DEBUG_NORMAL, "Using DAGMan config file: %s\n",
  153. _dagmanConfigFile );
  154. // We do this test here because the corresponding error
  155. // message from the config code doesn't show up in dagman.out.
  156. if ( access( _dagmanConfigFile, R_OK ) != 0 &&
  157. !is_piped_command( _dagmanConfigFile ) ) {
  158. debug_printf( DEBUG_QUIET,
  159. "ERROR: Can't read DAGMan config file: %s\n",
  160. _dagmanConfigFile );
  161. DC_Exit( EXIT_ERROR );
  162. }
  163. process_config_source( _dagmanConfigFile, "DAGMan config",
  164. NULL, true );
  165. }
  166. debug_level = (debug_level_t)param_integer( "DAGMAN_VERBOSITY",
  167. debug_level, DEBUG_SILENT, DEBUG_DEBUG_4 );
  168. debug_printf( DEBUG_NORMAL, "DAGMAN_VERBOSITY setting: %d\n",
  169. debug_level );
  170. debug_cache_size =
  171. param_integer( "DAGMAN_DEBUG_CACHE_SIZE", debug_cache_size,
  172. 0, INT_MAX);
  173. debug_printf( DEBUG_NORMAL, "DAGMAN_DEBUG_CACHE_SIZE setting: %d\n",
  174. debug_cache_size );
  175. debug_cache_enabled =
  176. param_boolean( "DAGMAN_DEBUG_CACHE_ENABLE", debug_cache_enabled );
  177. debug_printf( DEBUG_NORMAL, "DAGMAN_DEBUG_CACHE_ENABLE setting: %s\n",
  178. debug_cache_enabled?"True":"False" );
  179. submit_delay = param_integer( "DAGMAN_SUBMIT_DELAY", submit_delay, 0, 60 );
  180. debug_printf( DEBUG_NORMAL, "DAGMAN_SUBMIT_DELAY setting: %d\n",
  181. submit_delay );
  182. max_submit_attempts =
  183. param_integer( "DAGMAN_MAX_SUBMIT_ATTEMPTS", max_submit_attempts,
  184. 1, 16 );
  185. debug_printf( DEBUG_NORMAL, "DAGMAN_MAX_SUBMIT_ATTEMPTS setting: %d\n",
  186. max_submit_attempts );
  187. startup_cycle_detect =
  188. param_boolean( "DAGMAN_STARTUP_CYCLE_DETECT", startup_cycle_detect );
  189. debug_printf( DEBUG_NORMAL, "DAGMAN_STARTUP_CYCLE_DETECT setting: %s\n",
  190. startup_cycle_detect ? "True" : "False" );
  191. max_submits_per_interval =
  192. param_integer( "DAGMAN_MAX_SUBMITS_PER_INTERVAL",
  193. max_submits_per_interval, 1, 1000 );
  194. debug_printf( DEBUG_NORMAL, "DAGMAN_MAX_SUBMITS_PER_INTERVAL setting: %d\n",
  195. max_submits_per_interval );
  196. m_user_log_scan_interval =
  197. param_integer( "DAGMAN_USER_LOG_SCAN_INTERVAL",
  198. m_user_log_scan_interval, 1, INT_MAX);
  199. debug_printf( DEBUG_NORMAL, "DAGMAN_USER_LOG_SCAN_INTERVAL setting: %d\n",
  200. m_user_log_scan_interval );
  201. // Event checking setup...
  202. // We want to default to allowing the terminated/aborted
  203. // combination (that's what we've defaulted to in the past).
  204. // Okay, we also want to allow execute before submit because
  205. // we've run into that, and since DAGMan doesn't really care
  206. // about the execute events, it shouldn't abort the DAG.
  207. // And we further want to allow two terminated events for a
  208. // single job because people are seeing that with Globus
  209. // jobs!!
  210. allow_events = CheckEvents::ALLOW_TERM_ABORT |
  211. CheckEvents::ALLOW_EXEC_BEFORE_SUBMIT |
  212. CheckEvents::ALLOW_DOUBLE_TERMINATE |
  213. CheckEvents::ALLOW_DUPLICATE_EVENTS;
  214. // If the old DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION param is set,
  215. // we also allow extra runs.
  216. // Note: this parameter is probably only used by CDF, and only
  217. // really needed until they update all their systems to 6.7.3
  218. // or later (not 6.7.3 pre-release), which fixes the "double-run"
  219. // bug.
  220. bool allowExtraRuns = param_boolean(
  221. "DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION", false );
  222. if ( allowExtraRuns ) {
  223. allow_events |= CheckEvents::ALLOW_RUN_AFTER_TERM;
  224. debug_printf( DEBUG_NORMAL, "Warning: "
  225. "DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION "
  226. "is deprecated -- used DAGMAN_ALLOW_EVENTS instead\n" );
  227. }
  228. // Now get the new DAGMAN_ALLOW_EVENTS value -- that can override
  229. // all of the previous stuff.
  230. allow_events = param_integer("DAGMAN_ALLOW_EVENTS", allow_events);
  231. debug_printf( DEBUG_NORMAL, "allow_events ("
  232. "DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION, DAGMAN_ALLOW_EVENTS"
  233. ") setting: %d\n", allow_events );
  234. // ...end of event checking setup.
  235. retrySubmitFirst = param_boolean( "DAGMAN_RETRY_SUBMIT_FIRST",
  236. retrySubmitFirst );
  237. debug_printf( DEBUG_NORMAL, "DAGMAN_RETRY_SUBMIT_FIRST setting: %s\n",
  238. retrySubmitFirst ? "True" : "False" );
  239. retryNodeFirst = param_boolean( "DAGMAN_RETRY_NODE_FIRST",
  240. retryNodeFirst );
  241. debug_printf( DEBUG_NORMAL, "DAGMAN_RETRY_NODE_FIRST setting: %s\n",
  242. retryNodeFirst ? "True" : "False" );
  243. maxIdle =
  244. param_integer( "DAGMAN_MAX_JOBS_IDLE", maxIdle, 0, INT_MAX );
  245. debug_printf( DEBUG_NORMAL, "DAGMAN_MAX_JOBS_IDLE setting: %d\n",
  246. maxIdle );
  247. maxJobs =
  248. param_integer( "DAGMAN_MAX_JOBS_SUBMITTED", maxJobs, 0, INT_MAX );
  249. debug_printf( DEBUG_NORMAL, "DAGMAN_MAX_JOBS_SUBMITTED setting: %d\n",
  250. maxJobs );
  251. maxPreScripts = param_integer( "DAGMAN_MAX_PRE_SCRIPTS", maxPreScripts,
  252. 0, INT_MAX );
  253. debug_printf( DEBUG_NORMAL, "DAGMAN_MAX_PRE_SCRIPTS setting: %d\n",
  254. maxPreScripts );
  255. maxPostScripts = param_integer( "DAGMAN_MAX_POST_SCRIPTS", maxPostScripts,
  256. 0, INT_MAX );
  257. debug_printf( DEBUG_NORMAL, "DAGMAN_MAX_POST_SCRIPTS setting: %d\n",
  258. maxPostScripts );
  259. allowLogError = param_boolean( "DAGMAN_ALLOW_LOG_ERROR", allowLogError );
  260. debug_printf( DEBUG_NORMAL, "DAGMAN_ALLOW_LOG_ERROR setting: %s\n",
  261. allowLogError ? "True" : "False" );
  262. mungeNodeNames = param_boolean( "DAGMAN_MUNGE_NODE_NAMES",
  263. mungeNodeNames );
  264. debug_printf( DEBUG_NORMAL, "DAGMAN_MUNGE_NODE_NAMES setting: %s\n",
  265. mungeNodeNames ? "True" : "False" );
  266. prohibitMultiJobs = param_boolean( "DAGMAN_PROHIBIT_MULTI_JOBS",
  267. prohibitMultiJobs );
  268. debug_printf( DEBUG_NORMAL, "DAGMAN_PROHIBIT_MULTI_JOBS setting: %s\n",
  269. prohibitMultiJobs ? "True" : "False" );
  270. submitDepthFirst = param_boolean( "DAGMAN_SUBMIT_DEPTH_FIRST",
  271. submitDepthFirst );
  272. debug_printf( DEBUG_NORMAL, "DAGMAN_SUBMIT_DEPTH_FIRST setting: %s\n",
  273. submitDepthFirst ? "True" : "False" );
  274. free( condorSubmitExe );
  275. condorSubmitExe = param( "DAGMAN_CONDOR_SUBMIT_EXE" );
  276. if( !condorSubmitExe ) {
  277. condorSubmitExe = strdup( "condor_submit" );
  278. ASSERT( condorSubmitExe );
  279. }
  280. free( condorRmExe );
  281. condorRmExe = param( "DAGMAN_CONDOR_RM_EXE" );
  282. if( !condorRmExe ) {
  283. condorRmExe = strdup( "condor_rm" );
  284. ASSERT( condorRmExe );
  285. }
  286. free( storkSubmitExe );
  287. storkSubmitExe = param( "DAGMAN_STORK_SUBMIT_EXE" );
  288. if( !storkSubmitExe ) {
  289. storkSubmitExe = strdup( "stork_submit" );
  290. ASSERT( storkSubmitExe );
  291. }
  292. free( storkRmExe );
  293. storkRmExe = param( "DAGMAN_STORK_RM_EXE" );
  294. if( !storkRmExe ) {
  295. storkRmExe = strdup( "stork_rm" );
  296. ASSERT( storkRmExe );
  297. }
  298. abortDuplicates = param_boolean( "DAGMAN_ABORT_DUPLICATES",
  299. abortDuplicates );
  300. debug_printf( DEBUG_NORMAL, "DAGMAN_ABORT_DUPLICATES setting: %s\n",
  301. abortDuplicates ? "True" : "False" );
  302. abortOnScarySubmit = param_boolean( "DAGMAN_ABORT_ON_SCARY_SUBMIT",
  303. abortOnScarySubmit );
  304. debug_printf( DEBUG_NORMAL, "DAGMAN_ABORT_ON_SCARY_SUBMIT setting: %s\n",
  305. abortOnScarySubmit ? "True" : "False" );
  306. pendingReportInterval = param_integer( "DAGMAN_PENDING_REPORT_INTERVAL",
  307. pendingReportInterval );
  308. debug_printf( DEBUG_NORMAL, "DAGMAN_PENDING_REPORT_INTERVAL setting: %d\n",
  309. pendingReportInterval );
  310. autoRescue = param_boolean( "DAGMAN_AUTO_RESCUE", autoRescue );
  311. debug_printf( DEBUG_NORMAL, "DAGMAN_AUTO_RESCUE setting: %s\n",
  312. autoRescue ? "True" : "False" );
  313. maxRescueDagNum = param_integer( "DAGMAN_MAX_RESCUE_NUM",
  314. maxRescueDagNum, 0, ABS_MAX_RESCUE_DAG_NUM );
  315. debug_printf( DEBUG_NORMAL, "DAGMAN_MAX_RESCUE_NUM setting: %d\n",
  316. maxRescueDagNum );
  317. free( _defaultNodeLog );
  318. _defaultNodeLog = param( "DAGMAN_DEFAULT_NODE_LOG" );
  319. debug_printf( DEBUG_NORMAL, "DAGMAN_DEFAULT_NODE_LOG setting: %s\n",
  320. _defaultNodeLog ? _defaultNodeLog : "null" );
  321. _generateSubdagSubmits =
  322. param_boolean( "DAGMAN_GENERATE_SUBDAG_SUBMITS",
  323. _generateSubdagSubmits );
  324. debug_printf( DEBUG_NORMAL, "DAGMAN_GENERATE_SUBDAG_SUBMITS setting: %s\n",
  325. _generateSubdagSubmits ? "True" : "False" );
  326. _maxJobHolds = param_integer( "DAGMAN_MAX_JOB_HOLDS", _maxJobHolds,
  327. 0, 1000000 );
  328. char *debugSetting = param( "ALL_DEBUG" );
  329. debug_printf( DEBUG_NORMAL, "ALL_DEBUG setting: %s\n",
  330. debugSetting ? debugSetting : "" );
  331. if ( debugSetting ) {
  332. free( debugSetting );
  333. }
  334. debugSetting = param( "DAGMAN_DEBUG" );
  335. debug_printf( DEBUG_NORMAL, "DAGMAN_DEBUG setting: %s\n",
  336. debugSetting ? debugSetting : "" );
  337. if ( debugSetting ) {
  338. free( debugSetting );
  339. }
  340. // enable up the debug cache if needed
  341. if (debug_cache_enabled) {
  342. debug_cache_set_size(debug_cache_size);
  343. debug_cache_enable();
  344. }
  345. return true;
  346. }
  // Reconfig hook.  NOTE: this is only called on reconfig, not at startup.
  //
  // Deliberately a no-op: the call to dagman.Config() below is left
  // commented out because, even if new config values were picked up here,
  // they would not be passed on to the Dag object, which is where most of
  // them actually take effect.  (See Gnats PR 808.)  wenger 2007-02-09
  void
  main_config()
  {
      // dagman.Config();
  }
  357. // this is called by DC when the schedd is shutdown fast
  358. void
  359. main_shutdown_fast()
  360. {
  361. dagman.dag->GetJobstateLog().WriteDagmanFinished( EXIT_RESTART );
  362. DC_Exit( EXIT_RESTART );
  363. }
  364. // this can be called by other functions, or by DC when the schedd is
  365. // shutdown gracefully
  366. void main_shutdown_graceful() {
  367. dagman.dag->DumpNodeStatus( true, false );
  368. dagman.dag->GetJobstateLog().WriteDagmanFinished( EXIT_RESTART );
  369. dagman.CleanUp();
  370. DC_Exit( EXIT_RESTART );
  371. }
  372. void main_shutdown_rescue( int exitVal ) {
  373. debug_printf( DEBUG_QUIET, "Aborting DAG...\n" );
  374. if( dagman.dag ) {
  375. // we write the rescue DAG *before* removing jobs because
  376. // otherwise if we crashed, failed, or were killed while
  377. // removing them, we would leave the DAG in an
  378. // unrecoverable state...
  379. if( exitVal != 0 ) {
  380. if( dagman.rescueFileToWrite ) {
  381. debug_printf( DEBUG_NORMAL, "Rescue DAG file %s was specified; "
  382. "overriding automatic rescue DAG naming\n",
  383. dagman.rescueFileToWrite );
  384. dagman.dag->WriteRescue( dagman.rescueFileToWrite,
  385. dagman.primaryDagFile.Value() );
  386. } else if ( dagman.maxRescueDagNum > 0 ) {
  387. dagman.dag->Rescue( dagman.primaryDagFile.Value(),
  388. dagman.multiDags, dagman.maxRescueDagNum );
  389. } else {
  390. debug_printf( DEBUG_QUIET, "No rescue DAG written because "
  391. "DAGMAN_MAX_RESCUE_NUM is 0\n" );
  392. }
  393. }
  394. debug_printf( DEBUG_DEBUG_1, "We have %d running jobs to remove\n",
  395. dagman.dag->NumJobsSubmitted() );
  396. if( dagman.dag->NumJobsSubmitted() > 0 ) {
  397. debug_printf( DEBUG_NORMAL, "Removing submitted jobs...\n" );
  398. dagman.dag->RemoveRunningJobs(dagman);
  399. }
  400. if ( dagman.dag->NumScriptsRunning() > 0 ) {
  401. debug_printf( DEBUG_NORMAL, "Removing running scripts...\n" );
  402. dagman.dag->RemoveRunningScripts();
  403. }
  404. dagman.dag->PrintDeferrals( DEBUG_NORMAL, true );
  405. }
  406. dagman.dag->DumpNodeStatus( false, true );
  407. dagman.dag->GetJobstateLog().WriteDagmanFinished( exitVal );
  408. unlink( lockFileName );
  409. dagman.CleanUp();
  410. DC_Exit( exitVal );
  411. }
  412. // this gets called by DC when DAGMan receives a SIGUSR1 -- which,
  413. // assuming the DAGMan submit file was properly written, is the signal
  414. // the schedd will send if the DAGMan job is removed from the queue
  415. int main_shutdown_remove(Service *, int) {
  416. debug_printf( DEBUG_QUIET, "Received SIGUSR1\n" );
  417. main_shutdown_rescue( EXIT_ABORT );
  418. return FALSE;
  419. }
  420. void ExitSuccess() {
  421. dagman.dag->DumpNodeStatus( false, false );
  422. dagman.dag->GetJobstateLog().WriteDagmanFinished( EXIT_OKAY );
  423. unlink( lockFileName );
  424. dagman.CleanUp();
  425. DC_Exit( EXIT_OKAY );
  426. }
  427. void condor_event_timer();
  428. /****** FOR TESTING *******
  429. int main_testing_stub( Service *, int ) {
  430. if( dagman.paused ) {
  431. ResumeDag(dagman);
  432. }
  433. else {
  434. PauseDag(dagman);
  435. }
  436. return true;
  437. }
  438. ****** FOR TESTING ********/
  439. //---------------------------------------------------------------------------
  440. void main_init (int argc, char ** const argv) {
  441. printf ("Executing condor dagman ... \n");
  442. // flag used if DAGMan is invoked with -WaitForDebug so we
  443. // wait for a developer to attach with a debugger...
  444. volatile int wait_for_debug = 0;
  445. // process any config vars -- this happens before we process
  446. // argv[], since arguments should override config settings
  447. dagman.Config();
  448. // The DCpermission (last parm) should probably be PARENT, if it existed
  449. daemonCore->Register_Signal( SIGUSR1, "SIGUSR1",
  450. (SignalHandler) main_shutdown_remove,
  451. "main_shutdown_remove", NULL);
  452. /****** FOR TESTING *******
  453. daemonCore->Register_Signal( SIGUSR2, "SIGUSR2",
  454. (SignalHandler) main_testing_stub,
  455. "main_testing_stub", NULL);
  456. ****** FOR TESTING ********/
  457. debug_progname = condor_basename(argv[0]);
  458. // condor_submit_dag version from .condor.sub
  459. bool allowVerMismatch = false;
  460. const char *csdVersion = "undefined";
  461. int i;
  462. for (i = 0 ; i < argc ; i++) {
  463. debug_printf( DEBUG_NORMAL, "argv[%d] == \"%s\"\n", i, argv[i] );
  464. }
  465. if (argc < 2) Usage(); // Make sure an input file was specified
  466. // get dagman job id from environment, if it's there
  467. // (otherwise it will be set to "-1.-1.-1")
  468. dagman.DAGManJobId.SetFromString( getenv( EnvGetName( ENV_ID ) ) );
  469. //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  470. // Minimum legal version for a .condor.sub file to be compatible
  471. // with this condor_dagman binary.
  472. // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  473. // Be sure to change this if the arguments or environment
  474. // passed to condor_dagman change in an incompatible way!!
  475. // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  476. struct DagVersionData {
  477. int majorVer;
  478. int minorVer;
  479. int subMinorVer;
  480. };
  481. const DagVersionData MIN_SUBMIT_FILE_VERSION = { 7, 1, 2 };
  482. // Construct a string of the minimum submit file version.
  483. MyString minSubmitVersionStr;
  484. minSubmitVersionStr.sprintf( "%d.%d.%d",
  485. MIN_SUBMIT_FILE_VERSION.majorVer,
  486. MIN_SUBMIT_FILE_VERSION.minorVer,
  487. MIN_SUBMIT_FILE_VERSION.subMinorVer );
  488. //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  489. //
  490. // Process command-line arguments
  491. //
  492. for (i = 1; i < argc; i++) {
  493. if( !strcasecmp( "-Debug", argv[i] ) ) {
  494. i++;
  495. if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
  496. debug_printf( DEBUG_SILENT, "No debug level specified\n" );
  497. Usage();
  498. }
  499. debug_level = (debug_level_t) atoi (argv[i]);
  500. } else if( !strcasecmp( "-Lockfile", argv[i] ) ) {
  501. i++;
  502. if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
  503. debug_printf( DEBUG_SILENT, "No DagMan lockfile specified\n" );
  504. Usage();
  505. }
  506. lockFileName = argv[i];
  507. } else if( !strcasecmp( "-Help", argv[i] ) ) {
  508. Usage();
  509. } else if (!strcasecmp( "-Dag", argv[i] ) ) {
  510. i++;
  511. if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
  512. debug_printf( DEBUG_SILENT, "No DAG specified\n" );
  513. Usage();
  514. }
  515. dagman.dagFiles.append( argv[i] );
  516. } else if( !strcasecmp( "-Rescue", argv[i] ) ) {
  517. i++;
  518. if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
  519. debug_printf( DEBUG_SILENT, "No Rescue DAG specified\n" );
  520. Usage();
  521. }
  522. dagman.rescueFileToWrite = argv[i];
  523. } else if( !strcasecmp( "-MaxIdle", argv[i] ) ) {
  524. i++;
  525. if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
  526. debug_printf( DEBUG_SILENT,
  527. "Integer missing after -MaxIdle\n" );
  528. Usage();
  529. }
  530. dagman.maxIdle = atoi( argv[i] );
  531. } else if( !strcasecmp( "-MaxJobs", argv[i] ) ) {
  532. i++;
  533. if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
  534. debug_printf( DEBUG_SILENT,
  535. "Integer missing after -MaxJobs\n" );
  536. Usage();
  537. }
  538. dagman.maxJobs = atoi( argv[i] );
  539. } else if( !strcasecmp( "-MaxScripts", argv[i] ) ) {
  540. debug_printf( DEBUG_SILENT, "-MaxScripts has been replaced with "
  541. "-MaxPre and -MaxPost arguments\n" );
  542. Usage();
  543. } else if( !strcasecmp( "-MaxPre", argv[i] ) ) {
  544. i++;
  545. if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
  546. debug_printf( DEBUG_SILENT,
  547. "Integer missing after -MaxPre\n" );
  548. Usage();
  549. }
  550. dagman.maxPreScripts = atoi( argv[i] );
  551. } else if( !strcasecmp( "-MaxPost", argv[i] ) ) {
  552. i++;
  553. if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
  554. debug_printf( DEBUG_SILENT,
  555. "Integer missing after -MaxPost\n" );
  556. Usage();
  557. }
  558. dagman.maxPostScripts = atoi( argv[i] );
  559. } else if( !strcasecmp( "-NoEventChecks", argv[i] ) ) {
  560. debug_printf( DEBUG_SILENT, "Warning: -NoEventChecks is "
  561. "ignored; please use the DAGMAN_ALLOW_EVENTS "
  562. "config parameter instead\n");
  563. } else if( !strcasecmp( "-AllowLogError", argv[i] ) ) {
  564. dagman.allowLogError = true;
  565. } else if( !strcasecmp( "-WaitForDebug", argv[i] ) ) {
  566. wait_for_debug = 1;
  567. } else if( !strcasecmp( "-UseDagDir", argv[i] ) ) {
  568. dagman.useDagDir = true;
  569. } else if( !strcasecmp( "-AutoRescue", argv[i] ) ) {
  570. i++;
  571. if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
  572. debug_printf( DEBUG_SILENT, "No AutoRescue value specified\n" );
  573. Usage();
  574. }
  575. dagman.autoRescue = (atoi( argv[i] ) != 0);
  576. } else if( !strcasecmp( "-DoRescueFrom", argv[i] ) ) {
  577. i++;
  578. if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
  579. debug_printf( DEBUG_SILENT, "No rescue DAG number specified\n" );
  580. Usage();
  581. }
  582. dagman.doRescueFrom = atoi (argv[i]);
  583. } else if( !strcasecmp( "-CsdVersion", argv[i] ) ) {
  584. i++;
  585. if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
  586. debug_printf( DEBUG_SILENT, "No CsdVersion value specified\n" );
  587. Usage();
  588. }
  589. csdVersion = argv[i];
  590. } else if( !strcasecmp( "-AllowVersionMismatch", argv[i] ) ) {
  591. allowVerMismatch = true;
  592. } else if( !strcasecmp( "-DumpRescue", argv[i] ) ) {
  593. dagman.dumpRescueDag = true;
  594. } else if( !strcasecmp( "-verbose", argv[i] ) ) {
  595. dagman._submitDagDeepOpts.bVerbose = true;
  596. } else if( !strcasecmp( "-force", argv[i] ) ) {
  597. dagman._submitDagDeepOpts.bForce = true;
  598. } else if( !strcasecmp( "-notification", argv[i] ) ) {
  599. i++;
  600. if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
  601. debug_printf( DEBUG_SILENT, "No notification value specified\n" );
  602. Usage();
  603. }
  604. dagman._submitDagDeepOpts.strNotification = argv[i];
  605. } else if( !strcasecmp( "-dagman", argv[i] ) ) {
  606. i++;
  607. if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
  608. debug_printf( DEBUG_SILENT, "No dagman value specified\n" );
  609. Usage();
  610. }
  611. dagman._submitDagDeepOpts.strDagmanPath = argv[i];
  612. } else if( !strcasecmp( "-outfile_dir", argv[i] ) ) {
  613. i++;
  614. if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
  615. debug_printf( DEBUG_SILENT, "No outfile_dir value specified\n" );
  616. Usage();
  617. }
  618. dagman._submitDagDeepOpts.strOutfileDir = argv[i];
  619. } else if( !strcasecmp( "-update_submit", argv[i] ) ) {
  620. dagman._submitDagDeepOpts.updateSubmit = true;
  621. } else if( !strcasecmp( "-import_env", argv[i] ) ) {
  622. dagman._submitDagDeepOpts.importEnv = true;
  623. } else {
  624. debug_printf( DEBUG_SILENT, "\nUnrecognized argument: %s\n",
  625. argv[i] );
  626. Usage();
  627. }
  628. }
  629. dagman.dagFiles.rewind();
  630. dagman.primaryDagFile = dagman.dagFiles.next();
  631. dagman.multiDags = (dagman.dagFiles.number() > 1);
  632. MyString tmpDefaultLog;
  633. if ( dagman._defaultNodeLog != NULL ) {
  634. tmpDefaultLog = dagman._defaultNodeLog;
  635. free( dagman._defaultNodeLog );
  636. } else {
  637. tmpDefaultLog = dagman.primaryDagFile + ".nodes.log";
  638. }
  639. // Force default log file path to be absolute so it works
  640. // with -usedagdir and DIR nodes.
  641. CondorError errstack;
  642. if ( !MultiLogFiles::makePathAbsolute( tmpDefaultLog, errstack) ) {
  643. debug_printf( DEBUG_QUIET, "Unable to convert default log "
  644. "file name to absolute path: %s\n",
  645. errstack.getFullText() );
  646. dagman.dag->GetJobstateLog().WriteDagmanFinished( EXIT_ERROR );
  647. DC_Exit( EXIT_ERROR );
  648. }
  649. dagman._defaultNodeLog = strdup( tmpDefaultLog.Value() );
  650. debug_printf( DEBUG_NORMAL, "Default node log file is: <%s>\n",
  651. dagman._defaultNodeLog);
  652. //
  653. // Check the arguments
  654. //
  655. //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  656. // Checking for version compatibility between the .condor.sub
  657. // file and this condor_dagman binary...
  658. // Note: if we're in recovery mode and the submit file version
  659. // causes us to quit, we leave any existing node jobs still
  660. // running -- may want to change that eventually. wenger 2009-10-13.
  661. // Version of the condor_submit_dag that created our submit file.
  662. CondorVersionInfo submitFileVersion( csdVersion );
  663. // Version of this condor_dagman binary.
  664. CondorVersionInfo dagmanVersion;
  665. // Just generate this message fragment in one place.
  666. MyString versionMsg;
  667. versionMsg.sprintf("the version (%s) of this DAG's Condor submit "
  668. "file (created by condor_submit_dag)", csdVersion );
  669. // Make sure version in submit file is valid.
  670. if( !submitFileVersion.is_valid() ) {
  671. if ( !allowVerMismatch ) {
  672. debug_printf( DEBUG_QUIET, "Error: %s is invalid!\n",
  673. versionMsg.Value() );
  674. DC_Exit( EXIT_ERROR );
  675. } else {
  676. debug_printf( DEBUG_NORMAL, "Warning: %s is invalid; "
  677. "continuing because of -AllowVersionMismatch flag\n",
  678. versionMsg.Value() );
  679. }
  680. // Make sure .condor.sub file is recent enough.
  681. } else if ( submitFileVersion.compare_versions(
  682. CondorVersion() ) != 0 ) {
  683. if( !submitFileVersion.built_since_version(
  684. MIN_SUBMIT_FILE_VERSION.majorVer,
  685. MIN_SUBMIT_FILE_VERSION.minorVer,
  686. MIN_SUBMIT_FILE_VERSION.subMinorVer ) ) {
  687. if ( !allowVerMismatch ) {
  688. debug_printf( DEBUG_QUIET, "Error: %s is older than "
  689. "oldest permissible version (%s)\n",
  690. versionMsg.Value(), minSubmitVersionStr.Value() );
  691. DC_Exit( EXIT_ERROR );
  692. } else {
  693. debug_printf( DEBUG_NORMAL, "Warning: %s is older than "
  694. "oldest permissible version (%s); continuing "
  695. "because of -AllowVersionMismatch flag\n",
  696. versionMsg.Value(), minSubmitVersionStr.Value() );
  697. }
  698. // Warn if .condor.sub file is a newer version than this binary.
  699. } else if (dagmanVersion.compare_versions( csdVersion ) > 0 ) {
  700. debug_printf( DEBUG_NORMAL, "Warning: %s is newer than "
  701. "condor_dagman version (%s)\n", versionMsg.Value(),
  702. CondorVersion() );
  703. } else {
  704. debug_printf( DEBUG_NORMAL, "Note: %s differs from "
  705. "condor_dagman version (%s), but the "
  706. "difference is permissible\n",
  707. versionMsg.Value(), CondorVersion() );
  708. }
  709. }
  710. //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  711. if( dagman.primaryDagFile == "" ) {
  712. debug_printf( DEBUG_SILENT, "No DAG file was specified\n" );
  713. Usage();
  714. }
  715. if (lockFileName == NULL) {
  716. debug_printf( DEBUG_SILENT, "No DAG lock file was specified\n" );
  717. Usage();
  718. }
  719. if( dagman.maxJobs < 0 ) {
  720. debug_printf( DEBUG_SILENT, "-MaxJobs must be non-negative\n");
  721. Usage();
  722. }
  723. if( dagman.maxPreScripts < 0 ) {
  724. debug_printf( DEBUG_SILENT, "-MaxPre must be non-negative\n" );
  725. Usage();
  726. }
  727. if( dagman.maxPostScripts < 0 ) {
  728. debug_printf( DEBUG_SILENT, "-MaxPost must be non-negative\n" );
  729. Usage();
  730. }
  731. if( dagman.doRescueFrom < 0 ) {
  732. debug_printf( DEBUG_SILENT, "-DoRescueFrom must be non-negative\n" );
  733. Usage();
  734. }
  735. if (dagman.rescueFileToWrite && dagman.autoRescue) {
  736. debug_printf( DEBUG_QUIET, "Error: old-style rescue DAG specified "
  737. "and DAGMAN_AUTO_RESCUE is true\n" );
  738. DC_Exit( EXIT_ERROR );
  739. }
  740. debug_printf( DEBUG_VERBOSE, "DAG Lockfile will be written to %s\n",
  741. lockFileName );
  742. if ( dagman.dagFiles.number() == 1 ) {
  743. debug_printf( DEBUG_VERBOSE, "DAG Input file is %s\n",
  744. dagman.primaryDagFile.Value() );
  745. } else {
  746. MyString msg = "DAG Input files are ";
  747. dagman.dagFiles.rewind();
  748. const char *dagFile;
  749. while ( (dagFile = dagman.dagFiles.next()) != NULL ) {
  750. msg += dagFile;
  751. msg += " ";
  752. }
  753. msg += "\n";
  754. debug_printf( DEBUG_VERBOSE, "%s", msg.Value() );
  755. }
  756. if ( dagman.rescueFileToWrite ) {
  757. debug_printf( DEBUG_VERBOSE, "Rescue DAG will be written to %s\n",
  758. dagman.rescueFileToWrite );
  759. }
  760. // if requested, wait for someone to attach with a debugger...
  761. while( wait_for_debug );
  762. {
  763. MyString cwd;
  764. if( !condor_getcwd(cwd) ) {
  765. cwd = "<null>";
  766. }
  767. debug_printf( DEBUG_DEBUG_1, "Current path is %s\n",cwd.Value());
  768. char *temp = my_username();
  769. debug_printf( DEBUG_DEBUG_1, "Current user is %s\n",
  770. temp ? temp : "<null>" );
  771. if( temp ) {
  772. free( temp );
  773. }
  774. }
  775. //
  776. // Figure out the rescue DAG to run, if any (this is with "new-
  777. // style" rescue DAGs.
  778. //
  779. int rescueDagNum = 0;
  780. MyString rescueDagMsg;
  781. if ( dagman.doRescueFrom != 0 ) {
  782. rescueDagNum = dagman.doRescueFrom;
  783. rescueDagMsg.sprintf( "Rescue DAG number %d specified", rescueDagNum );
  784. RenameRescueDagsAfter( dagman.primaryDagFile.Value(),
  785. dagman.multiDags, rescueDagNum, dagman.maxRescueDagNum );
  786. } else if ( dagman.autoRescue ) {
  787. rescueDagNum = FindLastRescueDagNum(
  788. dagman.primaryDagFile.Value(),
  789. dagman.multiDags, dagman.maxRescueDagNum );
  790. rescueDagMsg.sprintf( "Found rescue DAG number %d", rescueDagNum );
  791. }
  792. //
  793. // If we are running a "new-style" rescue DAG, update our DAG
  794. // files list accordingly.
  795. //
  796. if ( rescueDagNum > 0 ) {
  797. dagman.rescueFileToRun = RescueDagName(
  798. dagman.primaryDagFile.Value(),
  799. dagman.multiDags, rescueDagNum );
  800. debug_printf ( DEBUG_QUIET, "%s; running %s instead of normal "
  801. "DAG file%s\n", rescueDagMsg.Value(),
  802. dagman.rescueFileToRun.Value(),
  803. dagman.multiDags ? "s" : "");
  804. debug_printf ( DEBUG_QUIET,
  805. "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n");
  806. debug_printf ( DEBUG_QUIET, "RUNNING RESCUE DAG %s\n",
  807. dagman.rescueFileToRun.Value() );
  808. // Note: if we ran multiple DAGs and they failed, the
  809. // whole thing is condensed into a single rescue DAG.
  810. // wenger 2007-02-27
  811. dagman.dagFiles.clearAll();
  812. dagman.dagFiles.append( dagman.rescueFileToRun.Value() );
  813. dagman.dagFiles.rewind();
  814. if ( dagman.useDagDir ) {
  815. debug_printf ( DEBUG_NORMAL,
  816. "Unsetting -useDagDir flag because we're running "
  817. "a rescue DAG\n" );
  818. dagman.useDagDir = false;
  819. }
  820. }
  821. //
  822. // Fill in values in the deep submit options that we haven't
  823. // already set.
  824. //
  825. dagman._submitDagDeepOpts.bAllowLogError = dagman.allowLogError;
  826. dagman._submitDagDeepOpts.useDagDir = dagman.useDagDir;
  827. dagman._submitDagDeepOpts.oldRescue =
  828. (dagman.rescueFileToWrite != NULL);
  829. dagman._submitDagDeepOpts.autoRescue = dagman.autoRescue;
  830. dagman._submitDagDeepOpts.doRescueFrom = dagman.doRescueFrom;
  831. dagman._submitDagDeepOpts.allowVerMismatch = allowVerMismatch;
  832. dagman._submitDagDeepOpts.recurse = false;
  833. //
  834. // Create the DAG
  835. //
  836. // Note: a bunch of the parameters we pass here duplicate things
  837. // in submitDagOpts, but I'm keeping them separate so we don't have to
  838. // bother to construct a new SubmitDagOtions object for splices.
  839. // wenger 2010-03-25
  840. dagman.dag = new Dag( dagman.dagFiles, dagman.maxJobs,
  841. dagman.maxPreScripts, dagman.maxPostScripts,
  842. dagman.allowLogError, dagman.useDagDir,
  843. dagman.maxIdle, dagman.retrySubmitFirst,
  844. dagman.retryNodeFirst, dagman.condorRmExe,
  845. dagman.storkRmExe, &dagman.DAGManJobId,
  846. dagman.prohibitMultiJobs, dagman.submitDepthFirst,
  847. dagman._defaultNodeLog,
  848. dagman._generateSubdagSubmits,
  849. &dagman._submitDagDeepOpts,
  850. false ); /* toplevel dag! */
  851. if( dagman.dag == NULL ) {
  852. EXCEPT( "ERROR: out of memory!\n");
  853. }
  854. dagman.dag->SetAbortOnScarySubmit( dagman.abortOnScarySubmit );
  855. dagman.dag->SetAllowEvents( dagman.allow_events );
  856. dagman.dag->SetConfigFile( dagman._dagmanConfigFile );
  857. dagman.dag->SetMaxJobHolds( dagman._maxJobHolds );
  858. //
  859. // Parse the input files. The parse() routine
  860. // takes care of adding jobs and dependencies to the DagMan
  861. //
  862. if ( dagman.dagFiles.number() < 2 ) dagman.mungeNodeNames = false;
  863. parseSetDoNameMunge( dagman.mungeNodeNames );
  864. debug_printf( DEBUG_VERBOSE, "Parsing %d dagfiles\n",
  865. dagman.dagFiles.number() );
  866. dagman.dagFiles.rewind();
  867. char *dagFile;
  868. // Here we make a copy of the dagFiles for iteration purposes. Deep inside
  869. // of the parsing, copies of the dagman.dagFile string list happen which
  870. // mess up the iteration of this list.
  871. char *str = dagman.dagFiles.print_to_delimed_string();
  872. StringList sl(str);
  873. free(str);
  874. sl.rewind();
  875. while ( (dagFile = sl.next()) != NULL ) {
  876. debug_printf( DEBUG_VERBOSE, "Parsing %s ...\n", dagFile );
  877. if( !parse( dagman.dag, dagFile, dagman.useDagDir ) ) {
  878. if ( dagman.dumpRescueDag ) {
  879. // Dump the rescue DAG so we can see what we got
  880. // in the failed parse attempt.
  881. debug_printf( DEBUG_QUIET, "Dumping rescue DAG "
  882. "because of -DumpRescue flag\n" );
  883. dagman.dag->Rescue( dagman.primaryDagFile.Value(),
  884. dagman.multiDags, dagman.maxRescueDagNum,
  885. true );
  886. }
  887. // Note: debug_error calls DC_Exit().
  888. debug_error( 1, DEBUG_QUIET, "Failed to parse %s\n",
  889. dagFile );
  890. }
  891. }
  892. dagman.dag->GetJobstateLog().WriteDagmanStarted( dagman.DAGManJobId );
  893. if ( rescueDagNum > 0 ) {
  894. // Get our Pegasus sequence numbers set correctly.
  895. dagman.dag->GetJobstateLog().InitializeRescue();
  896. }
  897. // lift the final set of splices into the main dag.
  898. dagman.dag->LiftSplices(SELF);
  899. dagman.dag->CheckThrottleCats();
  900. // fix up any use of $(JOB) in the vars values for any node
  901. dagman.dag->ResolveVarsInterpolations();
  902. /* debug_printf(DEBUG_QUIET, "COMPLETED DAG!\n");*/
  903. /* dagman.dag->PrintJobList();*/
  904. #ifndef NOT_DETECT_CYCLE
  905. if( dagman.startup_cycle_detect && dagman.dag->isCycle() )
  906. {
  907. debug_error (1, DEBUG_QUIET, "ERROR: a cycle exists in the dag, plese check input\n");
  908. }
  909. #endif
  910. debug_printf( DEBUG_VERBOSE, "Dag contains %d total jobs\n",
  911. dagman.dag->NumNodes() );
  912. MyString firstLocation;
  913. if ( dagman.dag->GetReject( firstLocation ) ) {
  914. debug_printf( DEBUG_QUIET, "Exiting because of REJECT "
  915. "specification in %s. This most likely means "
  916. "that the DAG file was produced with the -DumpRescue "
  917. "flag when parsing the original DAG failed.\n",
  918. firstLocation.Value() );
  919. DC_Exit( EXIT_ERROR );
  920. return;
  921. }
  922. dagman.dag->DumpDotFile();
  923. if ( dagman.dumpRescueDag ) {
  924. debug_printf( DEBUG_QUIET, "Dumping rescue DAG and exiting "
  925. "because of -DumpRescue flag\n" );
  926. dagman.dag->Rescue( dagman.primaryDagFile.Value(),
  927. dagman.multiDags, dagman.maxRescueDagNum );
  928. ExitSuccess();
  929. return;
  930. }
  931. //------------------------------------------------------------------------
  932. // Bootstrap and Recovery
  933. //
  934. // If the Lockfile exists, this indicates a premature termination
  935. // of a previous run of Dagman. If condor log is also present,
  936. // we run in recovery mode
  937. // If the Daglog is not present, then we do not run in recovery
  938. // mode
  939. {
  940. bool recovery = access(lockFileName, F_OK) == 0;
  941. if (recovery) {
  942. debug_printf( DEBUG_VERBOSE, "Lock file %s detected, \n",
  943. lockFileName);
  944. if (dagman.abortDuplicates) {
  945. if (util_check_lock_file(lockFileName) == 1) {
  946. debug_printf( DEBUG_QUIET, "Aborting because it "
  947. "looks like another instance of DAGMan is "
  948. "currently running on this DAG; if that is "
  949. "not the case, delete the lock file (%s) "
  950. "and re-submit the DAG.\n", lockFileName );
  951. dagman.dag->GetJobstateLog().
  952. WriteDagmanFinished( EXIT_RESTART );
  953. dagman.CleanUp();
  954. DC_Exit( EXIT_ERROR );
  955. // We should never get to here!
  956. }
  957. }
  958. }
  959. //
  960. // If this DAGMan continues, it should overwrite the lock
  961. // file if it exists.
  962. //
  963. util_create_lock_file(lockFileName, dagman.abortDuplicates);
  964. debug_printf( DEBUG_VERBOSE, "Bootstrapping...\n");
  965. if( !dagman.dag->Bootstrap( recovery ) ) {
  966. dagman.dag->PrintReadyQ( DEBUG_DEBUG_1 );
  967. debug_error( 1, DEBUG_QUIET, "ERROR while bootstrapping\n");
  968. }
  969. }
  970. debug_printf( DEBUG_VERBOSE, "Registering condor_event_timer...\n" );
  971. daemonCore->Register_Timer( 1, dagman.m_user_log_scan_interval,
  972. condor_event_timer, "condor_event_timer" );
  973. dagman.dag->SetPendingNodeReportInterval(
  974. dagman.pendingReportInterval );
  975. }
  976. void
  977. print_status() {
  978. int total = dagman.dag->NumNodes();
  979. int done = dagman.dag->NumNodesDone();
  980. int pre = dagman.dag->PreRunNodeCount();
  981. int submitted = dagman.dag->NumJobsSubmitted();
  982. int post = dagman.dag->PostRunNodeCount();
  983. int ready = dagman.dag->NumNodesReady();
  984. int failed = dagman.dag->NumNodesFailed();
  985. int unready = total - (done + pre + submitted + post + ready + failed );
  986. debug_printf( DEBUG_VERBOSE, "Of %d nodes total:\n", total );
  987. debug_printf( DEBUG_VERBOSE, " Done Pre Queued Post Ready Un-Ready Failed\n" );
  988. debug_printf( DEBUG_VERBOSE, " === === === === === === ===\n" );
  989. debug_printf( DEBUG_VERBOSE, "%5d %5d %5d %5d %5d %5d %5d\n",
  990. done, pre, submitted, post, ready, unready, failed );
  991. debug_printf( DEBUG_VERBOSE, "%d job proc(s) currently held\n",
  992. dagman.dag->NumHeldJobProcs() );
  993. dagman.dag->PrintDeferrals( DEBUG_VERBOSE, false );
  994. }
  995. void condor_event_timer () {
  996. ASSERT( dagman.dag != NULL );
  997. //------------------------------------------------------------------------
  998. // Proceed with normal operation
  999. //
  1000. // At this point, the DAG is bootstrapped. All jobs premarked DONE
  1001. // are in a STATUS_DONE state, and all their children have been
  1002. // marked ready to submit.
  1003. //
  1004. // If recovery was needed, the log file has been completely read and
  1005. // we are ready to proceed with jobs yet unsubmitted.
  1006. //------------------------------------------------------------------------
  1007. if( dagman.paused == true ) {
  1008. debug_printf( DEBUG_DEBUG_1, "(DAGMan paused)\n" );
  1009. return;
  1010. }
  1011. static int prevJobsDone = 0;
  1012. static int prevJobs = 0;
  1013. static int prevJobsFailed = 0;
  1014. static int prevJobsSubmitted = 0;
  1015. static int prevJobsReady = 0;
  1016. static int prevScriptRunNodes = 0;
  1017. static int prevJobsHeld = 0;
  1018. int justSubmitted;
  1019. justSubmitted = dagman.dag->SubmitReadyJobs(dagman);
  1020. if( justSubmitted ) {
  1021. // Note: it would be nice to also have the proc submit
  1022. // count here. wenger, 2006-02-08.
  1023. debug_printf( DEBUG_VERBOSE, "Just submitted %d job%s this cycle...\n",
  1024. justSubmitted, justSubmitted == 1 ? "" : "s" );
  1025. }
  1026. // If the log has grown
  1027. if( dagman.dag->DetectCondorLogGrowth() ) {
  1028. if( dagman.dag->ProcessLogEvents( CONDORLOG ) == false ) {
  1029. dagman.dag->PrintReadyQ( DEBUG_DEBUG_1 );
  1030. main_shutdown_rescue( EXIT_ERROR );
  1031. return;
  1032. }
  1033. }
  1034. if( dagman.dag->DetectDaPLogGrowth() ) {
  1035. if( dagman.dag->ProcessLogEvents( DAPLOG ) == false ) {
  1036. debug_printf( DEBUG_NORMAL,
  1037. "ProcessLogEvents(DAPLOG) returned false\n");
  1038. dagman.dag->PrintReadyQ( DEBUG_DEBUG_1 );
  1039. main_shutdown_rescue( EXIT_ERROR );
  1040. return;
  1041. }
  1042. }
  1043. // print status if anything's changed (or we're in a high debug level)
  1044. if( prevJobsDone != dagman.dag->NumNodesDone()
  1045. || prevJobs != dagman.dag->NumNodes()
  1046. || prevJobsFailed != dagman.dag->NumNodesFailed()
  1047. || prevJobsSubmitted != dagman.dag->NumJobsSubmitted()
  1048. || prevJobsReady != dagman.dag->NumNodesReady()
  1049. || prevScriptRunNodes != dagman.dag->ScriptRunNodeCount()
  1050. || prevJobsHeld != dagman.dag->NumHeldJobProcs()
  1051. || DEBUG_LEVEL( DEBUG_DEBUG_4 ) ) {
  1052. print_status();
  1053. prevJobsDone = dagman.dag->NumNodesDone();
  1054. prevJobs = dagman.dag->NumNodes();
  1055. prevJobsFailed = dagman.dag->NumNodesFailed();
  1056. prevJobsSubmitted = dagman.dag->NumJobsSubmitted();
  1057. prevJobsReady = dagman.dag->NumNodesReady();
  1058. prevScriptRunNodes = dagman.dag->ScriptRunNodeCount();
  1059. prevJobsHeld = dagman.dag->NumHeldJobProcs();
  1060. if( dagman.dag->GetDotFileUpdate() ) {
  1061. dagman.dag->DumpDotFile();
  1062. }
  1063. }
  1064. dagman.dag->DumpNodeStatus( false, false );
  1065. ASSERT( dagman.dag->NumNodesDone() + dagman.dag->NumNodesFailed()
  1066. <= dagman.dag->NumNodes() );
  1067. //
  1068. // If DAG is complete, hurray, and exit.
  1069. //
  1070. if( dagman.dag->DoneSuccess() ) {
  1071. ASSERT( dagman.dag->NumJobsSubmitted() == 0 );
  1072. dagman.dag->CheckAllJobs();
  1073. debug_printf( DEBUG_NORMAL, "All jobs Completed!\n" );
  1074. dagman.dag->PrintDeferrals( DEBUG_NORMAL, true );
  1075. if ( dagman.dag->NumIdleJobProcs() != 0 ) {
  1076. debug_printf( DEBUG_NORMAL, "Warning: DAGMan thinks there "
  1077. "are %d idle jobs, even though the DAG is "
  1078. "completed!\n", dagman.dag->NumIdleJobProcs() );
  1079. }
  1080. ExitSuccess();
  1081. return;
  1082. }
  1083. //
  1084. // If no jobs are submitted and no scripts are running, but the
  1085. // dag is not complete, then at least one job failed, or a cycle
  1086. // exists.
  1087. //
  1088. if( dagman.dag->FinishedRunning() ) {
  1089. if( dagman.dag->DoneFailed() ) {
  1090. if( DEBUG_LEVEL( DEBUG_QUIET ) ) {
  1091. debug_printf( DEBUG_QUIET,
  1092. "ERROR: the following job(s) failed:\n" );
  1093. dagman.dag->PrintJobList( Job::STATUS_ERROR );
  1094. }
  1095. } else {
  1096. // no jobs failed, so a cycle must exist
  1097. debug_printf( DEBUG_QUIET, "ERROR: DAG finished but not all "
  1098. "nodes are complete -- checking for a cycle...\n" );
  1099. if( dagman.dag->isCycle() ) {
  1100. debug_printf (DEBUG_QUIET, "... ERROR: a cycle exists "
  1101. "in the dag, plese check input\n");
  1102. } else {
  1103. debug_printf (DEBUG_QUIET, "... ERROR: no cycle found; "
  1104. "unknown error condition\n");
  1105. }
  1106. if ( debug_level >= DEBUG_NORMAL ) {
  1107. dagman.dag->PrintJobList();
  1108. }
  1109. }
  1110. main_shutdown_rescue( EXIT_ERROR );
  1111. return;
  1112. }
  1113. }
  1114. void
  1115. main_pre_dc_init( int, char*[] )
  1116. {
  1117. DC_Skip_Auth_Init();
  1118. DC_Skip_Core_Init();
  1119. // Convert the DAGMan log file name to an absolute path if it's
  1120. // not one already, so that we'll log things to the right file
  1121. // if we change to a different directory.
  1122. const char * logFile = GetEnv( "_CONDOR_DAGMAN_LOG" );
  1123. if ( logFile && !fullpath( logFile ) ) {
  1124. MyString currentDir;
  1125. if ( condor_getcwd( currentDir ) ) {
  1126. MyString newLogFile(currentDir);
  1127. newLogFile += DIR_DELIM_STRING;
  1128. newLogFile += logFile;
  1129. SetEnv( "_CONDOR_DAGMAN_LOG", newLogFile.Value() );
  1130. } else {
  1131. debug_printf( DEBUG_NORMAL, "ERROR: unable to get cwd: %d, %s\n",
  1132. errno, strerror(errno) );
  1133. }
  1134. }
  1135. }
  // Hook with an intentionally empty body: nothing is done here.
  void
  main_pre_command_sock_init()
  {
      // Deliberately a no-op.
  }