PageRenderTime 47ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 1ms

/scalr-2/tags/scalr-2.2.1/app/src/Scalr/System/Cronjob/Distributed.php

http://scalr.googlecode.com/
PHP | 464 lines | 331 code | 90 blank | 43 comment | 62 complexity | 9ae0ea8985352ead150b62a4c2e1f242 MD5 | raw file
Possible License(s): LGPL-2.1, Apache-2.0, GPL-3.0
  1. <?php
  /**
   * Multi-process cronjob that is coordinated between several computing nodes
   * through ZooKeeper.
   *
   * Each node registers itself in a node registry under the job znode, takes part
   * in a leader election and forks a "coordinator" process. The leader enqueues
   * work into a global (ZooKeeper) work queue; every coordinator moves messages
   * from the global queue into the local process pool queue.
   */
  class Scalr_System_Cronjob_Distributed extends Scalr_System_Cronjob_MultiProcess {

      /** $_ENV key holding the distributed configuration file name. */
      const ENV_CONFIG_FILE_PROPERTY = "scalr.system.dcron.configFile";

      /** $_ENV key holding the node name ("NONE" is the historical spelling of this constant). */
      const ENV_NONE_NAME_PROPERTY = "scalr.system.dcron.nodeName";

      /** CLI option carrying the distributed configuration file name. */
      const GETOPT_CONFIG_FILE = "distributed-ini";

      /** CLI option carrying the node name. */
      const GETOPT_NONE_NAME = "node-name";

      /** Node registry key under which the main (process pool) PID is stored. */
      const REGKEY_MAIN_PROCESS_PID = "main.pid";

      /** Node registry key under which the coordinator PID is stored. */
      const REGKEY_COORDINATOR_PROCESS_PID = "coord.pid";

      private $logger;

      /** ZooKeeper client, created in init() from $config["zookeeper"]. */
      protected $zookeeper;

      /** Job znode path: "<jobsZPath>/<jobName>". */
      protected $jobZPath;

      /** Name of the current computing node. */
      protected $nodeName;

      /** ZooKeeper-backed queue ("<jobZPath>/work-queue") shared by all nodes. */
      private $globalWorkQueue;

      /**
       * @var Scalr_System_Cronjob_Distributed_NodeRegistry
       */
      private $nodeRegistry;

      /** Passed as "quorum" to the leader election; may be set from configuration. */
      protected $quorum;

      /** Whether this node currently considers itself the leader. */
      protected $isLeader;

      private $leaderTimeout = 60000; // 1 minute

      /** Scalr_Service_Zookeeper_Election instance ("<jobZPath>/election"). */
      private $leaderElection;

      /** Passed as "timeout" to the leader election; may be set from configuration. */
      protected $electionTimeout;

      /** Number of exceptions the coordinator tolerates before terminating the job. */
      protected $coordinatorSlippageLimit = 10;

      /** Last observed mtime of the leader znode. */
      private $leaderMtime;

      /** ZooKeeper-backed queue ("<jobZPath>/returned-queue") of nodes that came back online. */
      private $returnedNodesQueue;

      /** PID of the coordinator process (set both in the parent and in the coordinator itself). */
      private $coordinatorPid;

      /** Loop flag of the coordinator process; cleared to stop it. */
      private $coordinatorLoop;

      /**
       * @var Scalr_System_Cronjob_MultiProcess_Worker
       */
      protected $worker;

      /**
       * @var Scalr_System_Cronjob_Distributed_Elector
       */
      protected $elector;

      /**
       * Returns the parent configuration extended with the CLI options
       * understood by the distributed cronjob.
       *
       * @return array
       */
      static function getConfig () {
          return Scalr_Util_Arrays::mergeReplaceRecursive(parent::getConfig(), array(
              "getoptRules" => array(
                  self::GETOPT_CONFIG_FILE."=s" => "Distributed cronjob configuration file. Local file or URL is accepted",
                  self::GETOPT_NONE_NAME."=s" => "Computing node name. Ex: node-1"
              )
          ));
      }

      /**
       * Registers this node and starts the coordinator process.
       *
       * When the node registry is empty the job is considered not running: the
       * job and leader znodes are created, an election is initiated and, if this
       * node wins, work is enqueued into the global queue. Otherwise the node
       * announces itself through the returned nodes queue.
       *
       * @param mixed $workQueue Not used here; the process pool's own work queue is passed to the worker.
       */
      function startForking ($workQueue) {
          $this->logger->debug("Start forking");

          //$this->processPool->on("signal", array($this, "onSignal"));
          //$this->processPool->on("shutdown", array($this, "onShutdown"));

          $this->worker->startForking($this->processPool->workQueue);

          if (!$this->nodeRegistry->nodesCapacity()) {
              $this->logger->info("Job is not running. Intiate job and begin leader election");

              // Create job znode
              $this->zookeeper->setOrCreate($this->jobZPath, null, false);

              // Create leader znode
              $this->zookeeper->setOrCreate("{$this->jobZPath}/leader", null, false);

              // Register node
              $this->nodeRegistry->set(self::REGKEY_MAIN_PROCESS_PID, posix_getpid());

              try {
                  $this->leaderElection->initiate();
              } catch (Scalr_Service_Zookeeper_InterruptedException $ignore) {
                  // Deliberately ignored; the election vote below is performed regardless.
              }

              $this->doLeaderElection();
              if ($this->isLeader) {
                  $this->worker->enqueueWork($this->globalWorkQueue);
              }
          } else {
              $this->logger->info("Job is already running. Put myself into returned nodes queue");
              $this->nodeRegistry->set(self::REGKEY_MAIN_PROCESS_PID, posix_getpid());
              $this->returnedNodesQueue->put($this->elector->getElectionData());
          }

          $this->forkCoordinator();

          //return $this->processPool->workQueue;
      }

      /**
       * Removes this node from the node registry (best effort) and delegates
       * to the parent cleanup.
       */
      function endForking () {
          $this->logger->info("End forking. Perform cleanup");

          try {
              if ($this->zookeeper) {
                  $this->logger->debug("Delete node from node registry");
                  $this->nodeRegistry->deleteNode();
              }
          } catch (Exception $e) {
              // Cleanup stays best-effort, but the failure is no longer silent.
              $this->logger->warn(sprintf("Cannot delete node from node registry: <%s> %s",
                      get_class($e), $e->getMessage()));
          }

          parent::endForking();
      }

      /**
       * Entry point of the cronjob.
       *
       * If a process pool is already running on this node, the call only
       * enqueues work (when this node is the leader and the memory limit check
       * passes) and returns. Otherwise the process pool is started.
       *
       * @param mixed $options Options forwarded to init().
       */
      function run ($options=null) {
          $this->init($options);

          // Check that process pool is running
          try {
              $poolPid = $this->nodeRegistry->get(self::REGKEY_MAIN_PROCESS_PID);
          } catch (Exception $e) {
              $this->logger->warn(sprintf("Caught: <%s> %s. Let the process pool is not started",
                      get_class($e), $e->getMessage()));
              $poolPid = 0;
          }

          if ($this->poolIsRunning($poolPid)) {
              // and i'm a leader node ...
              if ($this->nodeName == $this->zookeeper->getData("{$this->jobZPath}/leader")) {
                  if (!$this->checkMemoryLimit()) {
                      return;
                  }

                  // Enqueue work
                  $this->worker->enqueueWork($this->globalWorkQueue);
              }
              return;
          }

          $this->processPool->start();
      }

      /**
       * Loads the distributed configuration, resolves the node name and creates
       * the elector, the ZooKeeper client and all ZooKeeper-backed helpers.
       *
       * Configuration file name is taken from (first match wins): $_ENV,
       * CLI options, $config["iniFile"], "dcron.ini". Node name is taken from:
       * $_ENV, CLI options, php_uname("n").
       *
       * @param mixed $options Options; $options["getopt"] is queried for CLI values when present.
       * @throws Scalr_System_Cronjob_Exception When the configuration cannot be loaded or parsed,
       *                                        or the node name cannot be detected.
       */
      protected function init ($options=null) {
          $this->logger = Logger::getLogger(__CLASS__);

          $hasGetopt = $options && !empty($options["getopt"]);

          // Merge configurations. this config, ini config
          // Get configuration filename from ENV, CLI options, own static config, default: "dcron.ini"
          $configFileName = isset($_ENV[self::ENV_CONFIG_FILE_PROPERTY])
                  ? $_ENV[self::ENV_CONFIG_FILE_PROPERTY] : null;
          if (!$configFileName && $hasGetopt) {
              $configFileName = $options["getopt"]->getOption(self::GETOPT_CONFIG_FILE);
          }
          if (!$configFileName && !empty($this->config["iniFile"])) {
              $configFileName = $this->config["iniFile"];
          }
          if (!$configFileName) {
              $configFileName = "dcron.ini";
          }

          // Load configuration
          $configString = @file_get_contents($configFileName);
          if (!$configString) {
              throw new Scalr_System_Cronjob_Exception(sprintf("Cannot load configuration file '%s'", $configFileName));
          }
          $iniConfig = Scalr_Util_Compat::parseIniString($configString, true);
          if (!$iniConfig) {
              throw new Scalr_System_Cronjob_Exception(sprintf("Cannot parse configuration file '%s'", $configFileName));
          }

          // XXX Temporary hack
          if (!empty($iniConfig["remoteConfigUrl"])) {
              // Keep the URL in a local: $iniConfig is overwritten below and would
              // no longer contain it when the parse-failure message is built.
              $remoteConfigUrl = $iniConfig["remoteConfigUrl"];

              $this->logger->debug(sprintf("Fetch configuration from '%s'", $remoteConfigUrl));
              $configString = @file_get_contents($remoteConfigUrl);
              if (!$configString) {
                  throw new Scalr_System_Cronjob_Exception(sprintf("Cannot load configuration file '%s'", $remoteConfigUrl));
              }
              $iniConfig = Scalr_Util_Compat::parseIniString($configString, true);
              if (!$iniConfig) {
                  throw new Scalr_System_Cronjob_Exception(sprintf("Cannot parse configuration file '%s'", $remoteConfigUrl));
              }
          }

          // Apply configuration. Worker configuration is already applied
          $this->config = Scalr_Util_Arrays::mergeReplaceRecursive($iniConfig, $this->config);
          foreach ($this->config as $k => $v) {
              // Configuration keys that match a property name overwrite that property.
              if (property_exists($this, $k)) {
                  $this->{$k} = $v;
              }
          }

          // Get nodeName from ENV, CLI options, system host name
          $nodeName = isset($_ENV[self::ENV_NONE_NAME_PROPERTY])
                  ? $_ENV[self::ENV_NONE_NAME_PROPERTY] : null;
          if (!$nodeName && $hasGetopt) {
              $nodeName = $options["getopt"]->getOption(self::GETOPT_NONE_NAME);
          }
          if (!$nodeName) {
              $nodeName = php_uname("n");
          }
          if (!$nodeName) {
              throw new Scalr_System_Cronjob_Exception('Cannot detect current nodeName. '
                      . 'Use $_ENV or CLI options to setup nodeName');
          }
          $this->nodeName = $nodeName;

          $this->logger->info(sprintf("Initialize distributed cronjob (nodeName: %s, quorum: %d, distributedConfig: %s)",
                  $this->nodeName,
                  isset($this->config["quorum"]) ? $this->config["quorum"] : 0,
                  $configFileName));

          // Create elector
          $electorCls = !empty($this->config["electorCls"])
                  ? $this->config["electorCls"]
                  : "Scalr_System_Cronjob_Distributed_DefaultElector";
          $this->logger->info("Set elector: {$electorCls}");
          $this->elector = new $electorCls ($this->nodeName, $this->config);

          // ZOO
          $this->jobZPath = "{$this->config["jobsZPath"]}/{$this->jobName}";
          $this->zookeeper = new Scalr_Service_Zookeeper($this->config["zookeeper"]);

          $this->nodeRegistry = new Scalr_System_Cronjob_Distributed_NodeRegistry(array(
              "zookeeper" => $this->zookeeper,
              "path" => "{$this->jobZPath}/nodes",
              "node" => $this->nodeName
          ));

          $this->leaderElection = new Scalr_Service_Zookeeper_Election(array(
              "zookeeper" => $this->zookeeper,
              "path" => "{$this->jobZPath}/election",
              "timeout" => $this->electionTimeout,
              "quorum" => $this->quorum
          ));

          $this->returnedNodesQueue = new Scalr_Service_Zookeeper_Queue(array(
              "zookeeper" => $this->zookeeper,
              "path" => "{$this->jobZPath}/returned-queue"
          ));

          // Work queue
          $this->globalWorkQueue = new Scalr_Service_Zookeeper_Queue(array(
              "zookeeper" => $this->zookeeper,
              "path" => $this->jobZPath . "/work-queue"
          ));

          // Local queue
          $this->config["processPool"]["workQueue"] = new Scalr_System_Ipc_ShmQueue(array(
              "name" => "scalr.system.cronjob.multiprocess.workQueue-" . posix_getpid(),
              "blocking" => true,
              "autoInit" => true
          ));

          // Call parent initialization
          parent::init($options);
      }

      /**
       * Sends this node's vote and evaluates whatever votes were collected.
       *
       * An election timeout is logged and the results are checked anyway.
       */
      private function doLeaderElection () {
          $this->logger->info("Do leader election");

          try {
              $this->logger->info("Sending my vote");
              $this->leaderElection->vote($this->elector->getElectionData());
          } catch (Scalr_Util_TimeoutException $e) {
              $this->logger->error(sprintf("Timeout exceed (%s) while waiting for election complete",
                      $this->leaderElection->timeout->format()));
          }

          $this->checkElectionResults($this->leaderElection->getVotes(), true);
      }

      /**
       * Lets the elector pick a leader from the votes and updates the local
       * leader flag. The leader znode is rewritten when this node becomes
       * the leader or was the leader before the check.
       *
       * @param array $votes        Election data collected from the nodes.
       * @param bool  $updateQuorum Not used by the current implementation.
       * @throws Scalr_Service_Zookeeper_Exception When $votes is not an array.
       */
      private function checkElectionResults ($votes, $updateQuorum=true) {
          $this->logger->debug("Check election results");

          if (!is_array($votes)) {
              throw new Scalr_Service_Zookeeper_Exception("Argument '\$votes' must be array");
          }

          $leaderPath = "{$this->jobZPath}/leader";
          $leaderNode = $this->elector->determineLeaderNode($votes);

          if (!$leaderNode) {
              // The timeout object is created only to format the duration for the log message.
              $leaderTimeout = new Scalr_Util_Timeout($this->leaderTimeout);
              $this->logger->warn(sprintf("Elector cannot determine a leader node. "
                      . "Cronjob will wait %s and do another election", $leaderTimeout->format()));
          }

          $oldIsLeader = $this->isLeader;
          $this->isLeader = $leaderNode == $this->nodeName;

          if ($this->isLeader || $oldIsLeader) {
              $this->zookeeper->setOrCreate($leaderPath, "{$leaderNode}");
              $this->leaderMtime = $this->zookeeper->get($leaderPath)->mtime;
          }

          $this->logger->info($this->isLeader ? "I'm a leader!" : "O-ho-ho... I'm slave :(");
      }

      /**
       * Forks the coordinator process.
       *
       * The parent only remembers the coordinator PID. The child runs the
       * coordination loop until its parent disappears, the loop flag is cleared
       * or too many exceptions occur, and then exits. On every tick it: joins an
       * initiated election, refreshes the leader flag, tracks the leader znode
       * mtime, (leader only) processes returned nodes and removes stale registry
       * entries, touches its own registry node and moves messages from the
       * global work queue into the local one.
       *
       * @throws Scalr_System_Cronjob_Exception When the fork fails.
       */
      private function forkCoordinator () {
          $this->logger->info("Forking coordinator process");

          $pid = pcntl_fork();

          if ($pid == -1) {
              throw new Scalr_System_Cronjob_Exception("Cannot fork coordinator process");
          }

          if ($pid > 0) {
              // Parent process
              $this->coordinatorPid = $pid;
              return;
          }

          if ($pid == 0) {
              // Coordinator (child) process
              $this->coordinatorLoop = true;
              $this->coordinatorPid = posix_getpid();
              $ppid = posix_getppid();

              $this->nodeRegistry->set(self::REGKEY_COORDINATOR_PROCESS_PID, posix_getpid());

              $leaderPath = "{$this->jobZPath}/leader";
              $selfNodePath = "{$this->nodeRegistry->path}/{$this->nodeRegistry->node}";
              $tickTime = isset($this->config["tickTime"]) ? (int)$this->config["tickTime"] : 0;

              $leaderTimeout = new Scalr_Util_Timeout($this->leaderTimeout);
              $zombyTimeout = new Scalr_Util_Timeout($tickTime * 10);
              $heartbeatTimeout = new Scalr_Util_Timeout($tickTime);

              // Track mtime from self node
              $lastMtime = $this->zookeeper->get($selfNodePath)->mtime;

              while ($this->coordinatorLoop) {
                  $leaderTimeout->reset();

                  try {
                      // NOTE: the counter is reset once per outer iteration only,
                      // so it accumulates over the whole inner loop.
                      $exceptionCounter = 0;

                      // reached() is evaluated outside the inner try, so a
                      // Scalr_Util_TimeoutException raised by it lands in the outer catch.
                      while (!$leaderTimeout->reached() && $this->coordinatorLoop) {
                          try {
                              // Terminate myself if parent was killed
                              if (!posix_kill($ppid, 0)) {
                                  $this->coordinatorLoop = false;
                                  break 2;
                              }

                              // Leader election maybe initiated
                              if ($this->leaderElection->isInitiated()) {
                                  $this->logger->info("[coordinator] Someone has initiated leader election");
                                  $this->doLeaderElection();
                              }

                              // Leader may changed
                              $leaderNodeName = $this->zookeeper->getData($leaderPath);
                              $oldIsLeader = $this->isLeader;
                              $this->isLeader = $leaderNodeName == $this->nodeName;
                              if (!$this->isLeader && $oldIsLeader) {
                                  $this->logger->info("[coordinator] I am not longer a leader ('$this->nodeName'). "
                                          . "Leader is '$leaderNodeName'");
                              }

                              // Check leader znode mtime
                              $leaderStat = $this->zookeeper->get($leaderPath);
                              if ($leaderStat->mtime != $this->leaderMtime) {
                                  // Leader had updated it's state
                                  $leaderTimeout->reset();
                                  $this->logger->info("[coordinator] Leader is the same");
                                  $this->leaderMtime = $leaderStat->mtime;
                              }

                              if ($this->isLeader) {
                                  // Process returned nodes.
                                  // Administrator's configured leader may be here
                                  if ($c = $this->returnedNodesQueue->capacity()) {
                                      $this->logger->info(sprintf("%d node(s) have returned back online", $c));

                                      $votes = array($this->elector->getElectionData());
                                      while ($vote = $this->returnedNodesQueue->peek()) {
                                          $votes[] = $vote;
                                      }
                                      $this->checkElectionResults($votes, false);
                                  }

                                  // Check zomby nodes
                                  if ($zombyTimeout->reached(false)) {
                                      $childData = $this->zookeeper->getChildren($this->nodeRegistry->path);
                                      foreach ($childData->children as $childName) {
                                          $childPath = "{$this->nodeRegistry->path}/{$childName}";
                                          $childStat = $this->zookeeper->get($childPath);
                                          // A registry node older than our own mtime from the previous check is removed.
                                          if ($childStat->mtime < $lastMtime) {
                                              // Zomby detected
                                              $this->logger->info(sprintf("[coordinator] Cleanup zomby node '%s'", $childName));
                                              $this->zookeeper->deleteRecursive($childPath);
                                          }
                                      }

                                      $zombyTimeout->reset();
                                      $lastMtime = $this->zookeeper->get($selfNodePath)->mtime;
                                  }
                              }

                              // Node heart beat
                              if ($heartbeatTimeout->reached(false)) {
                                  $this->logger->debug(sprintf("[coordinator] '%s' heartbeat", $this->nodeName));
                                  $this->nodeRegistry->touchNode();
                                  $heartbeatTimeout->reset();
                              }

                              // Poll work queue
                              while ($message = $this->globalWorkQueue->peek()) {
                                  $this->logger->info("[coordinator] Put received message into local queue");
                                  $this->processPool->workQueue->put($message);
                              }

                              Scalr_Util_Timeout::sleep(1000);

                          } catch (Exception $e) {
                              $this->logger->error(sprintf("[coordinator] Caught in message loop <%s> %s",
                                      get_class($e), $e->getMessage()));

                              if (++$exceptionCounter > $this->coordinatorSlippageLimit) {
                                  $this->logger->fatal("[coordinator] Got too many consistent exceptions in main loop. "
                                          . "Slippage limit: {$this->coordinatorSlippageLimit} exceed");
                                  // Signal the parent captured at fork time; posix_getppid() would
                                  // return the adopting process if the parent has already died.
                                  posix_kill($ppid, SIGTERM);
                                  exit();
                              }

                              // Pause before retrying so a short outage does not exhaust
                              // the slippage limit within a few milliseconds.
                              Scalr_Util_Timeout::sleep(1000);
                          }
                      }
                  } catch (Scalr_Util_TimeoutException $e) {
                      $this->logger->warn("[coordinator] Caught leader timeout exception ({$leaderTimeout->format()})");
                      $this->logger->info("[coordinator] Start new leader election procedure");

                      try {
                          $this->leaderElection->initiate($this->nodeRegistry->nodesCapacity());
                      } catch (Exception $e) {
                          $this->logger->error(sprintf("[coordinator] Caught in leader election <%s> %s",
                                  get_class($e), $e->getMessage()));
                      }
                  }
              }

              $this->logger->info("[coordinator] Done");
              exit();
          }
      }

      /**
       * ProcessPool shutdown event handler: asks the coordinator to terminate.
       *
       * @param mixed $pool Process pool that is shutting down (not used).
       */
      function onShutdown ($pool) {
          if ($this->coordinatorPid) {
              $this->logger->info("Send SIGTERM -> coordinator (pid: {$this->coordinatorPid})");
              posix_kill($this->coordinatorPid, SIGTERM);
          }
      }

      /**
       * ProcessPool signal event handler. After the parent handling, a SIGTERM
       * received inside the coordinator process stops the coordinator loop.
       *
       * @param mixed $pool   Process pool that received the signal.
       * @param int   $signal Signal number.
       */
      function onSignal ($pool, $signal) {
          parent::onSignal($pool, $signal);

          if ($signal == SIGTERM && posix_getpid() == $this->coordinatorPid) {
              $this->logger->info("Handle SIGTERM in coordinator");
              $this->coordinatorLoop = false;
          }
      }
  }