PageRenderTime 26ms CodeModel.GetById 25ms RepoModel.GetById 0ms app.codeStats 0ms

/scalr-2/tags/scalr-2.0.0/app/src/Scalr/System/Cronjob/Distributed.php

http://scalr.googlecode.com/
PHP | 470 lines | 337 code | 90 blank | 43 comment | 63 complexity | b35366519c9af3af24f080244b925efb MD5 | raw file
Possible License(s): LGPL-2.1, Apache-2.0, GPL-3.0
  1. <?php
  class Scalr_System_Cronjob_Distributed extends Scalr_System_Cronjob_MultiProcess {

      /** Key looked up in $_ENV for the distributed configuration file name. */
      const ENV_CONFIG_FILE_PROPERTY = 'scalr.system.dcron.configFile';

      /** Key looked up in $_ENV for the node name ("NONE" is presumably a typo for "NODE"). */
      const ENV_NONE_NAME_PROPERTY = 'scalr.system.dcron.nodeName';

      /** Command line option carrying the distributed configuration file name. */
      const GETOPT_CONFIG_FILE = 'distributed-ini';

      /** Command line option carrying the node name. */
      const GETOPT_NONE_NAME = 'node-name';

      /** Node registry key under which the main process pid is stored. */
      const REGKEY_MAIN_PROCESS_PID = 'main.pid';

      /** Node registry key under which the coordinator process pid is stored. */
      const REGKEY_COORDINATOR_PROCESS_PID = 'coord.pid';

      /** Logger obtained in init() via Logger::getLogger(__CLASS__). */
      private $logger;

      /** Scalr_Service_Zookeeper client created in init(). */
      protected $zookeeper;

      /** Job znode path: "<config jobsZPath>/<jobName>". */
      protected $jobZPath;

      /** Name of this computing node (from $_ENV, CLI option or php_uname("n")). */
      protected $nodeName;

      /** Scalr_Service_Zookeeper_Queue at "<jobZPath>/work-queue". */
      private $globalWorkQueue;

      /**
       * @var Scalr_System_Cronjob_Distributed_NodeRegistry
       */
      private $nodeRegistry;

      /** Passed to the leader election as its "quorum" option; init() copies it from a config key of the same name. */
      protected $quorum;

      /** True when this node is currently considered the leader. */
      protected $isLeader;

      /** Leader timeout handed to Scalr_Util_Timeout. */
      private $leaderTimeout = 60000; // 1 minute

      /** Scalr_Service_Zookeeper_Election at "<jobZPath>/election". */
      private $leaderElection;

      /** Passed to the leader election as its "timeout" option. */
      protected $electionTimeout;

      /** Exceptions tolerated by the coordinator loop before it sends SIGTERM to the parent and exits. */
      protected $coordinatorSlippageLimit = 10;

      /** Last observed mtime of the "<jobZPath>/leader" znode. */
      private $leaderMtime;

      /** Scalr_Service_Zookeeper_Queue at "<jobZPath>/returned-queue". */
      private $returnedNodesQueue;

      /** Pid of the coordinator process (set in both the parent and the coordinator itself). */
      private $coordinatorPid;

      /** Loop flag of the coordinator process; cleared to make it stop. */
      private $coordinatorLoop;

      /**
       * @var Scalr_System_Cronjob_MultiProcess_Worker
       */
      protected $worker;

      /**
       * @var Scalr_System_Cronjob_Distributed_Elector
       */
      protected $elector;
  /**
   * Returns the parent's static configuration extended with the getopt rules
   * of the distributed cronjob (configuration file and node name options).
   *
   * @return mixed Result of Scalr_Util_Arrays::mergeReplaceRecursive()
   */
  static function getConfig () {
      $parentConfig = parent::getConfig();
      $distributedConfig = array(
          "getoptRules" => array(
              self::GETOPT_CONFIG_FILE."=s" => "Distributed cronjob configuration file. Local file or URL is accepted",
              self::GETOPT_NONE_NAME."=s" => "Computing node name. Ex: node-1"
          )
      );
      // The original had a second, unreachable "return $ret;" naming an
      // undefined variable; it has been removed.
      return Scalr_Util_Arrays::mergeReplaceRecursive($parentConfig, $distributedConfig);
  }
  /**
   * Starts the worker, joins the distributed job and forks the coordinator.
   *
   * When the node registry reports no nodes, this node creates the job and
   * leader znodes, registers itself, initiates a leader election and, if it
   * wins, enqueues work into the global queue. Otherwise it registers itself
   * and announces its return through the returned-nodes queue.
   *
   * @param mixed $workQueue Not used here; the worker always receives
   *                         $this->processPool->workQueue.
   */
  function startForking ($workQueue) {
      $this->logger->debug("Start forking");
      // NOTE(review): the pool event handlers are not registered here (kept disabled as in the original).
      //$this->processPool->on("signal", array($this, "onSignal"));
      //$this->processPool->on("shutdown", array($this, "onShutdown"));
      $this->worker->startForking($this->processPool->workQueue);

      if (!$this->nodeRegistry->nodesCapacity()) {
          $this->logger->info("Job is not running. Intiate job and begin leader election");
          // Create job znode
          $this->zookeeper->setOrCreate($this->jobZPath, null, false);
          // Create leader znode
          $this->zookeeper->setOrCreate("{$this->jobZPath}/leader", null, false);
          // Register node
          $this->nodeRegistry->set(self::REGKEY_MAIN_PROCESS_PID, posix_getpid());
          try {
              $this->leaderElection->initiate();
          } catch (Scalr_Service_Zookeeper_InterruptedException $e) {
              // Still non-fatal, as in the original, but no longer invisible:
              // leave a trace so an interrupted initiation can be diagnosed.
              $this->logger->debug(sprintf("Leader election initiation was interrupted: <%s> %s",
                      get_class($e), $e->getMessage()));
          }
          $this->doLeaderElection();
          if ($this->isLeader) {
              $this->worker->enqueueWork($this->globalWorkQueue);
          }
      } else {
          $this->logger->info("Job is already running. Put myself into returned nodes queue");
          $this->nodeRegistry->set(self::REGKEY_MAIN_PROCESS_PID, posix_getpid());
          $this->returnedNodesQueue->put($this->elector->getElectionData());
      }

      $this->forkCoordinator();
      //return $this->processPool->workQueue;
  }
  /**
   * Cleans up after forking has ended: removes this node from the node
   * registry (best effort) and then delegates to the parent implementation.
   */
  function endForking () {
      $this->logger->info("End forking. Perform cleanup");
      try {
          if ($this->zookeeper) {
              $this->logger->debug("Delete node from node registry");
              $this->nodeRegistry->deleteNode();
          }
      } catch (Exception $e) {
          // Cleanup stays best-effort (parent::endForking() must still run),
          // but the failure is reported instead of being silently dropped.
          $this->logger->warn(sprintf("Cannot delete node from node registry: <%s> %s",
                  get_class($e), $e->getMessage()));
      }
      parent::endForking();
  }
  /**
   * Entry point of the cronjob.
   *
   * Initializes the job, then either starts the process pool (when no pool is
   * running for this node) or, when a pool is already running and this node is
   * the leader, enqueues work into the global work queue.
   *
   * @param mixed $options Passed through to init()
   */
  function run ($options=null) {
      $this->init($options);

      // Look up the pid of the main process registered for this node
      try {
          $registeredPid = $this->nodeRegistry->get(self::REGKEY_MAIN_PROCESS_PID);
      } catch (Exception $e) {
          $this->logger->warn(sprintf("Caught: <%s> %s. Let the process pool is not started",
                  get_class($e), $e->getMessage()));
          $registeredPid = 0;
      }

      // No running pool: start one and we are done
      if (!$this->poolIsRunning($registeredPid)) {
          $this->processPool->start();
          return;
      }

      // Pool is running; only the leader node enqueues work
      $currentLeader = $this->zookeeper->getData("{$this->jobZPath}/leader");
      if ($this->nodeName != $currentLeader) {
          return;
      }
      if ($this->checkMemoryLimit()) {
          // Enqueue work
          $this->worker->enqueueWork($this->globalWorkQueue);
      }
  }
  /**
   * Initializes the distributed cronjob.
   *
   * Loads the ini configuration (optionally replaced by the one found at its
   * "remoteConfigUrl"), merges it with $this->config, copies config values
   * into same-named properties, resolves the node name, creates the elector
   * and all ZooKeeper-backed helpers, prepares the local work queue and
   * finally calls parent::init().
   *
   * @param mixed $options Optional; its "getopt" entry, when present, is
   *                       queried with getOption() for CLI overrides
   * @throws Scalr_System_Cronjob_Exception When the configuration cannot be
   *                       loaded or parsed, or the node name cannot be detected
   */
  protected function init ($options=null) {
      $this->logger = Logger::getLogger(__CLASS__);

      // Load configuration
      $configFileName = $this->resolveDistributedConfigFileName($options);
      $iniConfig = $this->loadDistributedIniConfig($configFileName);

      // XXX Temporary hack
      if (!empty($iniConfig["remoteConfigUrl"])) {
          // Keep the URL in its own variable: $iniConfig is about to be
          // replaced, and the error messages must still be able to name it.
          $remoteConfigUrl = $iniConfig["remoteConfigUrl"];
          $this->logger->debug(sprintf("Fetch configuration from '%s'", $remoteConfigUrl));
          $iniConfig = $this->loadDistributedIniConfig($remoteConfigUrl);
      }

      // Apply configuration. Worker configuration is already applied
      $this->config = Scalr_Util_Arrays::mergeReplaceRecursive($iniConfig, $this->config);
      foreach ($this->config as $k => $v) {
          if (property_exists($this, $k)) {
              $this->{$k} = $v;
          }
      }

      $this->nodeName = $this->resolveDistributedNodeName($options);

      $this->logger->info(sprintf("Initialize distributed cronjob (nodeName: %s, quorum: %d, distributedConfig: %s)",
              $this->nodeName,
              isset($this->config["quorum"]) ? $this->config["quorum"] : 0,
              $configFileName));

      // Create elector
      $electorCls = !empty($this->config["electorCls"])
              ? $this->config["electorCls"]
              : "Scalr_System_Cronjob_Distributed_DefaultElector";
      $this->logger->info("Set elector: {$electorCls}");
      $this->elector = new $electorCls ($this->nodeName, $this->config);

      // ZOO
      $this->jobZPath = "{$this->config["jobsZPath"]}/{$this->jobName}";
      $this->zookeeper = new Scalr_Service_Zookeeper($this->config["zookeeper"]);

      $this->nodeRegistry = new Scalr_System_Cronjob_Distributed_NodeRegistry(array(
          "zookeeper" => $this->zookeeper,
          "path" => "{$this->jobZPath}/nodes",
          "node" => $this->nodeName
      ));

      $this->leaderElection = new Scalr_Service_Zookeeper_Election(array(
          "zookeeper" => $this->zookeeper,
          "path" => "{$this->jobZPath}/election",
          "timeout" => $this->electionTimeout,
          "quorum" => $this->quorum
      ));

      $this->returnedNodesQueue = new Scalr_Service_Zookeeper_Queue(array(
          "zookeeper" => $this->zookeeper,
          "path" => "{$this->jobZPath}/returned-queue"
      ));

      // Work queue
      $this->globalWorkQueue = new Scalr_Service_Zookeeper_Queue(array(
          "zookeeper" => $this->zookeeper,
          "path" => $this->jobZPath . "/work-queue"
      ));

      // Local queue
      $this->config["processPool"]["workQueue"] = new Scalr_System_Ipc_ShmQueue(array(
          "name" => "scalr.system.cronjob.multiprocess.workQueue-" . posix_getpid(),
          "blocking" => true,
          "autoInit" => true
      ));

      if (!empty($this->config["processPool"]["preventParalleling"])) {
          $this->config["processPool"]["nowWorkingSet"] = new Scalr_Service_Zookeeper_Set(array(
              "zookeeper" => $this->zookeeper,
              "path" => $this->jobZPath . "/now-working-set"
          ));
      }

      // Call parent initialization
      parent::init($options);
  }

  /**
   * Picks the configuration file name: $_ENV, then the CLI option, then the
   * "iniFile" entry of $this->config, then the default "dcron.ini".
   *
   * @param mixed $options Same value that was given to init()
   * @return string
   */
  private function resolveDistributedConfigFileName ($options) {
      // isset()/empty() guards avoid undefined-index notices for optional keys
      $fileName = isset($_ENV[self::ENV_CONFIG_FILE_PROPERTY])
              ? $_ENV[self::ENV_CONFIG_FILE_PROPERTY]
              : null;
      if (!$fileName && $options && !empty($options["getopt"])) {
          $fileName = $options["getopt"]->getOption(self::GETOPT_CONFIG_FILE);
      }
      if (!$fileName && !empty($this->config["iniFile"])) {
          $fileName = $this->config["iniFile"];
      }
      return $fileName ? $fileName : "dcron.ini";
  }

  /**
   * Reads and parses one ini configuration from a file name or URL.
   *
   * @param string $source File name or URL given to file_get_contents()
   * @return mixed Parsed configuration (non-empty)
   * @throws Scalr_System_Cronjob_Exception When the source is unreadable/empty
   *                       or the parser returns an empty result
   */
  private function loadDistributedIniConfig ($source) {
      // Warnings are suppressed because failure is reported via the exception below
      $configString = @file_get_contents($source);
      if (!$configString) {
          throw new Scalr_System_Cronjob_Exception(sprintf("Cannot load configuration file '%s'", $source));
      }
      $iniConfig = Scalr_Util_Compat::parseIniString($configString, true);
      if (!$iniConfig) {
          throw new Scalr_System_Cronjob_Exception(sprintf("Cannot parse configuration file '%s'", $source));
      }
      return $iniConfig;
  }

  /**
   * Picks the node name: $_ENV, then the CLI option, then php_uname("n").
   *
   * @param mixed $options Same value that was given to init()
   * @return string
   * @throws Scalr_System_Cronjob_Exception When no name could be determined
   */
  private function resolveDistributedNodeName ($options) {
      $nodeName = isset($_ENV[self::ENV_NONE_NAME_PROPERTY])
              ? $_ENV[self::ENV_NONE_NAME_PROPERTY]
              : null;
      if (!$nodeName && $options && !empty($options["getopt"])) {
          $nodeName = $options["getopt"]->getOption(self::GETOPT_NONE_NAME);
      }
      if (!$nodeName) {
          // The original also created an unused Scalr_System_Shell here; dropped.
          $nodeName = php_uname("n");
      }
      if (!$nodeName) {
          throw new Scalr_System_Cronjob_Exception('Cannot detect current nodeName. '
                  . 'Use $_ENV or CLI options to setup nodeName');
      }
      return $nodeName;
  }
  /**
   * Casts this node's vote in the leader election and evaluates the votes.
   *
   * A Scalr_Util_TimeoutException raised while voting is logged and does not
   * stop the evaluation of whatever votes are available.
   */
  private function doLeaderElection () {
      $this->logger->info("Do leader election");

      try {
          $this->logger->info("Sending my vote");
          $myVote = $this->elector->getElectionData();
          $this->leaderElection->vote($myVote);
      } catch (Scalr_Util_TimeoutException $timeoutError) {
          $waited = $this->leaderElection->timeout->format();
          $this->logger->error(sprintf("Timeout exceed (%s) while waiting for election complete",
                  $waited));
      }

      $collectedVotes = $this->leaderElection->getVotes();
      $this->checkElectionResults($collectedVotes, true);
  }
  /**
   * Lets the elector pick a leader from the given votes and updates this
   * node's leader flag accordingly.
   *
   * When this node is (or was, before the call) the leader, the leader znode
   * is rewritten with the elected node name and its mtime is remembered.
   *
   * @param array $votes        Votes handed to the elector
   * @param bool  $updateQuorum Not used by this implementation
   * @throws Scalr_Service_Zookeeper_Exception When $votes is not an array
   */
  private function checkElectionResults ($votes, $updateQuorum=true) {
      $this->logger->debug("Check election results");

      if (!is_array($votes)) {
          throw new Scalr_Service_Zookeeper_Exception("Argument '\$votes' must be array");
      }

      $electedNode = $this->elector->determineLeaderNode($votes);
      if (!$electedNode) {
          // Only used to render the waiting period in the log message
          $waitPeriod = new Scalr_Util_Timeout($this->leaderTimeout);
          $this->logger->warn(sprintf("Elector cannot determine a leader node. "
                  . "Cronjob will wait %s and do another election", $waitPeriod->format()));
      }

      $wasLeader = $this->isLeader;
      $this->isLeader = $electedNode == $this->nodeName;

      if ($this->isLeader || $wasLeader) {
          $leaderZPath = "{$this->jobZPath}/leader";
          $this->zookeeper->setOrCreate($leaderZPath, "{$electedNode}");
          $this->leaderMtime = $this->zookeeper->get($leaderZPath)->mtime;
      }

      if ($this->isLeader) {
          $this->logger->info("I'm a leader!");
      } else {
          $this->logger->info("O-ho-ho... I'm slave :(");
      }
  }
  /**
   * Forks the coordinator process.
   *
   * In the parent only the child pid is remembered. The child never returns:
   * it registers its pid in the node registry and loops until
   * $this->coordinatorLoop is cleared or the parent disappears. In every
   * iteration (roughly once per second) it:
   *   - takes part in a leader election if one has been initiated,
   *   - refreshes its view of who the leader is and watches the leader znode mtime,
   *   - as leader: re-evaluates leadership against returned nodes and removes
   *     registry nodes whose mtime is older than the last recorded own mtime,
   *   - touches its own registry node (heartbeat),
   *   - moves messages from the global work queue into the local pool queue.
   * When the leader timeout expires, a new leader election is initiated.
   *
   * @throws Scalr_System_Cronjob_Exception When the process cannot be forked
   */
  private function forkCoordinator () {
      $this->logger->info("Forking coordinator process");
      $pid = pcntl_fork();

      if ($pid == -1) {
          throw new Scalr_System_Cronjob_Exception("Cannot fork coordinator process");
      }
      if ($pid > 0) {
          // Parent: remember the child so it can be signalled on shutdown
          $this->coordinatorPid = $pid;
          return;
      }

      // ---- Child (coordinator) from here on; it ends with exit() ----
      $this->coordinatorLoop = true;
      $this->coordinatorPid = posix_getpid();
      // Captured once, right after the fork. If the parent dies later,
      // posix_getppid() would report the adopting process instead.
      $ppid = posix_getppid();
      $this->nodeRegistry->set(self::REGKEY_COORDINATOR_PROCESS_PID, posix_getpid());

      $leaderPath = "{$this->jobZPath}/leader";
      $leaderTimeout = new Scalr_Util_Timeout($this->leaderTimeout);
      $zombyTimeout = new Scalr_Util_Timeout((int)$this->config["tickTime"]*10);
      $heartbeatTimeout = new Scalr_Util_Timeout((int)$this->config["tickTime"]);

      // Track mtime from self node
      $lastMtime = $this->zookeeper->get("{$this->nodeRegistry->path}/{$this->nodeRegistry->node}")->mtime;

      while ($this->coordinatorLoop) {
          $leaderTimeout->reset();
          try {
              // Counts exceptions per leader-timeout cycle; it is not reset by
              // a successful iteration.
              $exceptionCounter = 0;
              // NOTE(review): reached() without arguments is presumably what raises
              // Scalr_Util_TimeoutException for the outer catch - confirm in Scalr_Util_Timeout.
              while (!$leaderTimeout->reached() && $this->coordinatorLoop) {
                  try {
                      // Terminate myself if parent was killed
                      if (!posix_kill($ppid, 0)) {
                          $this->coordinatorLoop = false;
                          break 2;
                      }

                      // Leader election maybe initiated
                      if ($this->leaderElection->isInitiated()) {
                          $this->logger->info("[coordinator] Someone has initiated leader election");
                          $this->doLeaderElection();
                      }

                      // Leader may changed
                      $leaderNodeName = $this->zookeeper->getData($leaderPath);
                      $oldIsLeader = $this->isLeader;
                      $this->isLeader = $leaderNodeName == $this->nodeName;
                      if (!$this->isLeader && $oldIsLeader) {
                          $this->logger->info("[coordinator] I am not longer a leader ('$this->nodeName'). "
                                  . "Leader is '$leaderNodeName'");
                      }

                      // Check leader znode mtime
                      $leaderStat = $this->zookeeper->get($leaderPath);
                      if ($leaderStat->mtime != $this->leaderMtime) {
                          // Leader had updated it's state
                          $leaderTimeout->reset();
                          $this->logger->info("[coordinator] Leader is the same");
                          $this->leaderMtime = $leaderStat->mtime;
                      }

                      if ($this->isLeader) {
                          // Process returned nodes.
                          // Administrator's configured leader may be here
                          if ($c = $this->returnedNodesQueue->capacity()) {
                              $this->logger->info(sprintf("%d node(s) have returned back online", $c));
                              $votes = array($this->elector->getElectionData());
                              while ($vote = $this->returnedNodesQueue->peek()) {
                                  $votes[] = $vote;
                              }
                              $this->checkElectionResults($votes, false);
                          }

                          // Check zomby nodes
                          if ($zombyTimeout->reached(false)) {
                              $childData = $this->zookeeper->getChildren($this->nodeRegistry->path);
                              foreach ($childData->children as $childName) {
                                  $childStat = $this->zookeeper->get("{$this->nodeRegistry->path}/{$childName}");
                                  if ($childStat->mtime < $lastMtime) {
                                      // Zomby detected
                                      $this->logger->info(sprintf("[coordinator] Cleanup zomby node '%s'", $childName));
                                      $this->zookeeper->deleteRecursive("{$this->nodeRegistry->path}/{$childName}");
                                  }
                              }
                              $zombyTimeout->reset();
                              $lastMtime = $this->zookeeper->get("{$this->nodeRegistry->path}/{$this->nodeRegistry->node}")->mtime;
                          }
                      }

                      // Node heart beat
                      if ($heartbeatTimeout->reached(false)) {
                          $this->logger->debug(sprintf("[coordinator] '%s' heartbeat", $this->nodeName));
                          $this->nodeRegistry->touchNode();
                          $heartbeatTimeout->reset();
                      }

                      // Poll work queue
                      while ($message = $this->globalWorkQueue->peek()) {
                          $this->logger->info("[coordinator] Put received message into local queue");
                          $this->processPool->workQueue->put($message);
                      }

                      Scalr_Util_Timeout::sleep(1000);

                  } catch (Exception $e) {
                      // NOTE(review): a failing iteration skips the sleep above, so a
                      // persistent error retries immediately - confirm this is intended.
                      $this->logger->error(sprintf("[coordinator] Caught in message loop <%s> %s",
                              get_class($e), $e->getMessage()));
                      if (++$exceptionCounter > $this->coordinatorSlippageLimit) {
                          $this->logger->fatal("[coordinator] Got too many consistent exceptions in main loop. "
                                  . "Slippage limit: {$this->coordinatorSlippageLimit} exceed");
                          // Signal the parent recorded at fork time. Calling posix_getppid()
                          // again could name an unrelated process if the parent died meanwhile.
                          posix_kill($ppid, SIGTERM);
                          exit();
                      }
                  }
              }
          } catch (Scalr_Util_TimeoutException $e) {
              $this->logger->warn("[coordinator] Caught leader timeout exception ({$leaderTimeout->format()})");
              $this->logger->info("[coordinator] Start new leader election procedure");
              try {
                  $this->leaderElection->initiate($this->nodeRegistry->nodesCapacity());
              } catch (Exception $e) {
                  $this->logger->error(sprintf("[coordinator] Caught in leader election <%s> %s",
                          get_class($e), $e->getMessage()));
              }
          }
      }

      $this->logger->info("[coordinator] Done");
      exit();
  }
  /**
   * ProcessPool shutdown event handler: asks the coordinator process, if one
   * was forked, to terminate by sending it SIGTERM.
   *
   * @param mixed $pool Not used by this implementation
   */
  function onShutdown ($pool) {
      if (!$this->coordinatorPid) {
          return;
      }
      $this->logger->info("Send SIGTERM -> coordinator (pid: {$this->coordinatorPid})");
      posix_kill($this->coordinatorPid, SIGTERM);
  }
  /**
   * ProcessPool signal event handler.
   *
   * Delegates to the parent handler first; on SIGTERM received inside the
   * coordinator process it clears the coordinator loop flag so the loop ends.
   *
   * @param mixed $pool   Passed through to the parent handler
   * @param int   $signal Signal number
   */
  function onSignal ($pool, $signal) {
      parent::onSignal($pool, $signal);

      $inCoordinator = ($signal == SIGTERM) && (posix_getpid() == $this->coordinatorPid);
      if ($inCoordinator) {
          $this->logger->info("Handle SIGTERM in coordinator");
          $this->coordinatorLoop = false;
      }
  }
  380. }