PageRenderTime 50ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 0ms

/condor-7.9.0/src/condor_starter.V6.1/tool_daemon_proc.cpp

#
C++ | 477 lines | 311 code | 76 blank | 90 comment | 65 complexity | 332c77fc1b1aaa1bee7c56e5efa976f3 MD5 | raw file
Possible License(s): Apache-2.0
  1. /***************************************************************
  2. *
  3. * Copyright (C) 1990-2007, Condor Team, Computer Sciences Department,
  4. * University of Wisconsin-Madison, WI.
  5. *
  6. * Licensed under the Apache License, Version 2.0 (the "License"); you
  7. * may not use this file except in compliance with the License. You may
  8. * obtain a copy of the License at
  9. *
  10. * http://www.apache.org/licenses/LICENSE-2.0
  11. *
  12. * Unless required by applicable law or agreed to in writing, software
  13. * distributed under the License is distributed on an "AS IS" BASIS,
  14. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. * See the License for the specific language governing permissions and
  16. * limitations under the License.
  17. *
  18. ***************************************************************/
  19. #include "condor_common.h"
  20. #include "condor_classad.h"
  21. #include "condor_config.h"
  22. #include "condor_debug.h"
  23. #include "env.h"
  24. #include "user_proc.h"
  25. #include "tool_daemon_proc.h"
  26. #include "starter.h"
  27. #include "condor_daemon_core.h"
  28. #include "condor_attributes.h"
  29. #include "condor_uid.h"
  30. #include "condor_distribution.h"
  31. #include "basename.h"
  32. #ifdef WIN32
  33. #include "perm.h"
  34. #endif
  35. extern CStarter *Starter;
  36. /* ToolDaemonProc class implementation */
  37. ToolDaemonProc::ToolDaemonProc( ClassAd *jobAd, int application_pid )
  38. {
  39. dprintf( D_FULLDEBUG, "In ToolDaemonProc::ToolDaemonProc()\n" );
  40. JobAd = jobAd;
  41. job_suspended = false;
  42. ApplicationPid = application_pid;
  43. }
  44. int
  45. ToolDaemonProc::StartJob()
  46. {
  47. int i;
  48. int nice_inc = 0;
  49. dprintf( D_FULLDEBUG, "in ToolDaemonProc::StartJob()\n" );
  50. if( !JobAd ) {
  51. dprintf( D_ALWAYS, "No JobAd in ToolDaemonProc::StartJob()!\n" );
  52. return 0;
  53. }
  54. MyString DaemonNameStr;
  55. char* tmp = NULL;
  56. if( JobAd->LookupString( ATTR_TOOL_DAEMON_CMD, &tmp ) != 1 ) {
  57. dprintf( D_ALWAYS, "%s not found in JobAd. Aborting "
  58. "ToolDaemonProc::StartJob()\n", ATTR_TOOL_DAEMON_CMD );
  59. return 0;
  60. }
  61. const char* job_iwd = Starter->jic->jobIWD();
  62. dprintf( D_ALWAYS, "IWD: %s\n", job_iwd );
  63. const char* base = NULL;
  64. base = condor_basename( tmp );
  65. if( Starter->jic->iwdIsChanged() ) {
  66. DaemonNameStr.sprintf( "%s%c%s", Starter->GetWorkingDir(),
  67. DIR_DELIM_CHAR, base );
  68. } else if( ! fullpath(tmp) ) {
  69. DaemonNameStr.sprintf( "%s%c%s", job_iwd, DIR_DELIM_CHAR, tmp );
  70. } else {
  71. DaemonNameStr = tmp;
  72. }
  73. const char* DaemonName = DaemonNameStr.Value();
  74. free( tmp );
  75. tmp = NULL;
  76. dprintf( D_FULLDEBUG, "Daemon Name: %s \n", DaemonName );
  77. // This is something of an ugly hack. filetransfer doesn't
  78. // preserve file permissions when it moves a file. so, our
  79. // tool "binary" (or script, whatever it is), is sitting in
  80. // the starter's directory without an execute bit set. So,
  81. // we've got to call chmod() so that exec() doesn't fail.
  82. if( Starter->jic->iwdIsChanged() ) {
  83. priv_state old_priv = set_user_priv();
  84. int retval = chmod( DaemonName, S_IRWXU | S_IRWXO | S_IRWXG );
  85. set_priv( old_priv );
  86. if( retval < 0 ) {
  87. dprintf( D_ALWAYS, "Failed to chmod %s!\n", DaemonName );
  88. return 0;
  89. }
  90. }
  91. // compute job's renice value by evaluating the machine's
  92. // JOB_RENICE_INCREMENT in the context of the job ad...
  93. char* ptmp = param( "JOB_RENICE_INCREMENT" );
  94. if( ptmp ) {
  95. // insert renice expr into our copy of the job ad
  96. MyString reniceAttr = "Renice = ";
  97. reniceAttr += ptmp;
  98. if( !JobAd->Insert( reniceAttr.Value() ) ) {
  99. dprintf( D_ALWAYS, "ERROR: failed to insert JOB_RENICE_INCREMENT "
  100. "into job ad, Aborting ToolDaemonProc::StartJob...\n" );
  101. free( ptmp );
  102. return 0;
  103. }
  104. // evaluate
  105. if( JobAd->EvalInteger( "Renice", NULL, nice_inc ) ) {
  106. dprintf( D_ALWAYS, "Renice expr \"%s\" evaluated to %d\n",
  107. ptmp, nice_inc );
  108. } else {
  109. dprintf( D_ALWAYS, "WARNING: job renice expr (\"%s\") doesn't "
  110. "eval to int! Using default of 10...\n", ptmp );
  111. nice_inc = 10;
  112. }
  113. // enforce valid ranges for nice_inc
  114. if( nice_inc < 0 ) {
  115. dprintf( D_FULLDEBUG, "WARNING: job renice value (%d) is too "
  116. "low: adjusted to 0\n", nice_inc );
  117. nice_inc = 0;
  118. }
  119. else if( nice_inc > 19 ) {
  120. dprintf( D_FULLDEBUG, "WARNING: job renice value (%d) is too "
  121. "high: adjusted to 19\n", nice_inc );
  122. nice_inc = 19;
  123. }
  124. ASSERT( ptmp );
  125. free( ptmp );
  126. ptmp = NULL;
  127. } else {
  128. // if JOB_RENICE_INCREMENT is undefined, default to 10
  129. nice_inc = 10;
  130. }
  131. // // // // // //
  132. // Arguments
  133. // // // // // //
  134. ArgList DaemonArgs;
  135. ASSERT( tmp == NULL );
  136. DaemonArgs.AppendArg(DaemonName);
  137. JobAd->LookupString( ATTR_TOOL_DAEMON_ARGS2, &tmp );
  138. bool args_success = true;
  139. MyString args_error;
  140. if( tmp ) {
  141. args_success = DaemonArgs.AppendArgsV2Raw(tmp,&args_error);
  142. dprintf( D_FULLDEBUG, "Daemon Args: %s\n", tmp ) ;
  143. free( tmp );
  144. tmp = NULL;
  145. }
  146. else {
  147. JobAd->LookupString( ATTR_TOOL_DAEMON_ARGS1, &tmp );
  148. if( tmp ) {
  149. args_success = DaemonArgs.AppendArgsV1Raw(tmp,&args_error);
  150. dprintf( D_FULLDEBUG, "Daemon Args: %s\n", tmp ) ;
  151. free( tmp );
  152. tmp = NULL;
  153. }
  154. }
  155. if(!args_success) {
  156. dprintf(D_ALWAYS, "Aborting. Failed to read daemon args: %s\n",
  157. args_error.Value());
  158. return 0;
  159. }
  160. // // // // // //
  161. // Environment
  162. // // // // // //
  163. Env job_env;
  164. MyString env_errors;
  165. if( !job_env.MergeFrom(JobAd,&env_errors) ) {
  166. dprintf( D_ALWAYS, "Failed to read environment from JobAd. Aborting "
  167. "ToolDaemonProc::StartJob: %s\n",env_errors.Value());
  168. return 0;
  169. }
  170. // for now, we pass "ENV" as the address of the LASS
  171. // this tells the tool that the job PID will be placed
  172. // in the environment variable, "TDP_AP_PID"
  173. job_env.SetEnv("TDP_LASS_ADDRESS", "ENV");
  174. char pid_buf[256];
  175. sprintf(pid_buf, "%d", ApplicationPid);
  176. job_env.SetEnv("TDP_AP_PID", pid_buf);
  177. // Now, let the starter publish any env vars it wants to into
  178. // the mainjob's env...
  179. Starter->PublishToEnv( &job_env );
  180. // // // // // //
  181. // Standard Files
  182. // // // // // //
  183. // handle stdin, stdout, and stderr redirection
  184. int fds[3];
  185. // initialize these to -2 to mean they're not specified.
  186. // -1 will be treated as an error.
  187. fds[0] = -2; fds[1] = -2; fds[2] = -2;
  188. // in order to open these files we must have the user's privs:
  189. priv_state priv;
  190. priv = set_user_priv();
  191. fds[0] = openStdFile( SFT_IN,
  192. ATTR_TOOL_DAEMON_INPUT,
  193. false,
  194. "Tool Daemon Input file");
  195. fds[1] = openStdFile( SFT_OUT,
  196. ATTR_TOOL_DAEMON_OUTPUT,
  197. false,
  198. "Tool Daemon Output file");
  199. fds[2] = openStdFile( SFT_ERR,
  200. ATTR_TOOL_DAEMON_ERROR,
  201. false,
  202. "Tool Daemon Error file");
  203. /* Bail out if we couldn't open the std files correctly */
  204. if( fds[0] == -1 || fds[1] == -1 || fds[2] == -1 ) {
  205. /* only close ones that had been opened correctly */
  206. for( int fdindex=0; fdindex<=2; fdindex++ ) {
  207. if( fds[fdindex] >= 0 ) {
  208. close(fds[fdindex]);
  209. }
  210. }
  211. dprintf(D_ALWAYS, "Failed to open some/all of the std files...\n");
  212. dprintf(D_ALWAYS, "Aborting ToolDaemonProc::StartJob.\n");
  213. set_priv(priv); /* go back to original priv state before leaving */
  214. return 0;
  215. }
  216. // // // // // //
  217. // Misc + Exec
  218. // // // // // //
  219. // set up the FamilyInfo structure we will be using to track the tool
  220. // daemon's process family
  221. //
  222. FamilyInfo fi;
  223. fi.max_snapshot_interval = 15;
  224. char const *dedicated_account = NULL;
  225. if (job_universe != CONDOR_UNIVERSE_LOCAL) {
  226. dedicated_account = Starter->jic->getExecuteAccountIsDedicated();
  227. }
  228. if (dedicated_account) {
  229. fi.login = dedicated_account;
  230. dprintf(D_FULLDEBUG,
  231. "Tracking process family by login \"%s\"\n",
  232. fi.login);
  233. }
  234. MyString args_string;
  235. DaemonArgs.GetArgsStringForDisplay(&args_string);
  236. dprintf( D_ALWAYS, "About to exec %s\n", args_string.Value() );
  237. set_priv( priv );
  238. JobPid = daemonCore->Create_Process( DaemonName,
  239. DaemonArgs,
  240. PRIV_USER_FINAL,
  241. 1,
  242. FALSE,
  243. &job_env,
  244. job_iwd,
  245. &fi,
  246. NULL,
  247. fds,
  248. NULL,
  249. nice_inc,
  250. NULL,
  251. DCJOBOPT_NO_ENV_INHERIT );
  252. //NOTE: Create_Process() saves the errno for us if it is an
  253. //"interesting" error.
  254. char const *create_process_error = NULL;
  255. int create_process_errno = errno;
  256. if(JobPid == FALSE && errno) create_process_error = strerror(errno);
  257. // now close the descriptors in daemon_fds array. our child has inherited
  258. // them already, so we should close them so we do not leak descriptors.
  259. for (i=0;i<=2;i++) {
  260. if( fds[i] >= 0 ) {
  261. close( fds[i] );
  262. }
  263. }
  264. if( JobPid == FALSE ) {
  265. JobPid = -1;
  266. if( create_process_error ) {
  267. MyString err_msg;
  268. err_msg.sprintf( "Failed to execute '%s': %s",
  269. args_string.Value(), create_process_error );
  270. Starter->jic->notifyStarterError( err_msg.Value(), true, CONDOR_HOLD_CODE_FailedToCreateProcess, create_process_errno );
  271. }
  272. EXCEPT( "Create_Process(%s, ...) failed", args_string.Value() );
  273. return FALSE;
  274. } else {
  275. dprintf( D_ALWAYS, "Create_Process succeeded, pid=%d\n", JobPid );
  276. job_start_time.getTime();
  277. return TRUE;
  278. }
  279. }
  280. bool
  281. ToolDaemonProc::JobReaper(int pid, int status)
  282. {
  283. dprintf( D_FULLDEBUG, "Inside ToolDaemonProc::JobReaper()\n" );
  284. // If the tool exited, we want to shutdown everything.
  285. if (JobPid == pid) {
  286. if (daemonCore->Kill_Family(JobPid) == FALSE) {
  287. dprintf(D_ALWAYS,
  288. "error killing process family for job cleanup\n");
  289. }
  290. }
  291. return UserProc::JobReaper(pid, status);
  292. }
  293. // We don't have to do anything special to notify a shadow that we've
  294. // really exited.
  295. bool
  296. ToolDaemonProc::JobExit( void )
  297. {
  298. return true;
  299. }
  300. void
  301. ToolDaemonProc::Suspend()
  302. {
  303. dprintf(D_FULLDEBUG,"in ToolDaemonProc::Suspend()\n");
  304. // suspend the tool daemon job
  305. if ( JobPid != -1 ) {
  306. if (daemonCore->Suspend_Family(JobPid) == FALSE) {
  307. dprintf(D_ALWAYS,
  308. "error suspending process family\n");
  309. }
  310. }
  311. // set our flag
  312. job_suspended = true;
  313. }
  314. void
  315. ToolDaemonProc::Continue()
  316. {
  317. dprintf(D_FULLDEBUG,"in ToolDaemonProc::Continue()\n");
  318. // resume user job
  319. if ( JobPid != -1 && job_suspended ) {
  320. if (daemonCore->Continue_Family(JobPid) == FALSE) {
  321. dprintf(D_ALWAYS, "error continuing process family\n");
  322. }
  323. }
  324. // set our flag
  325. job_suspended = false;
  326. }
  327. bool
  328. ToolDaemonProc::ShutdownGraceful()
  329. {
  330. dprintf(D_FULLDEBUG,"in ToolDaemonProc::ShutdownGraceful()\n");
  331. if ( JobPid == -1 ) {
  332. // there is no process family yet, probably because we are still
  333. // transferring files. just return true to say we're all done,
  334. // and that way the starter class will simply delete us and the
  335. // FileTransfer destructor will clean up.
  336. return true;
  337. }
  338. // WE USED TO...
  339. //
  340. // take a snapshot before we softkill the parent job process.
  341. // this helps ensure that if the parent exits without killing
  342. // the kids, our JobExit() handler will get em all.
  343. //
  344. // TODO: should we explicitly call out to the procd here to tell
  345. // it to take a snapshot???
  346. // now softkill the parent job process.
  347. if( job_suspended ) {
  348. Continue();
  349. }
  350. // requested_exit = true;
  351. daemonCore->Send_Signal( JobPid, soft_kill_sig );
  352. return false; // return false says shutdown is pending
  353. }
  354. bool
  355. ToolDaemonProc::ShutdownFast()
  356. {
  357. dprintf(D_FULLDEBUG,"in ToolDaemonProc::ShutdownFast()\n");
  358. if ( JobPid == -1 ) {
  359. // there is no process family yet, probably because we are still
  360. // transferring files. just return true to say we're all done,
  361. // and that way the starter class will simply delete us and the
  362. // FileTransfer destructor will clean up.
  363. return true;
  364. }
  365. // We purposely do not do a SIGCONT here, since there is no sense
  366. // in potentially swapping the job back into memory if our next
  367. // step is to hard kill it.
  368. // requested_exit = true;
  369. if (daemonCore->Kill_Family(JobPid) == FALSE) {
  370. dprintf(D_ALWAYS,
  371. "error killing process family for fast shutdown\n");
  372. }
  373. return false; // return false says shutdown is pending
  374. }
  375. bool
  376. ToolDaemonProc::Remove()
  377. {
  378. if ( job_suspended ) {
  379. Continue();
  380. }
  381. requested_exit = true;
  382. daemonCore->Send_Signal(JobPid, rm_kill_sig);
  383. return false; // return false says shutdown is pending
  384. }
  385. bool
  386. ToolDaemonProc::Hold()
  387. {
  388. if ( job_suspended ) {
  389. Continue();
  390. }
  391. requested_exit = true;
  392. daemonCore->Send_Signal(JobPid, hold_kill_sig);
  393. return false; // return false says shutdown is pending
  394. }
  395. bool
  396. ToolDaemonProc::PublishUpdateAd( ClassAd* /*ad*/ )
  397. {
  398. dprintf( D_FULLDEBUG, "Inside ToolDaemonProc::PublishUpdateAd()\n" );
  399. // Nothing special for us to do.
  400. return true;
  401. }