PageRenderTime 44ms CodeModel.GetById 16ms RepoModel.GetById 1ms app.codeStats 0ms

/manager/manager.cc

http://github.com/yahoo/Pluton
C++ | 420 lines | 249 code | 87 blank | 84 comment | 43 complexity | b45690c0f7c694ba45f74babec4eca73 MD5 | raw file
  1. /*
  2. Copyright (c) 2010, Yahoo! Inc. All rights reserved.
  3. Redistribution and use of this software in source and binary forms, with or
  4. without modification, are permitted provided that the following conditions are
  5. met:
  6. * Redistributions of source code must retain the above copyright notice, this
  7. list of conditions and the following disclaimer.
  8. * Redistributions in binary form must reproduce the above copyright notice,
  9. this list of conditions and the following disclaimer in the documentation and/or
  10. other materials provided with the distribution.
  11. * Neither the name of Yahoo! Inc. nor the names of its contributors may be used
  12. to endorse or promote products derived from this software without specific prior
  13. written permission of Yahoo! Inc.
  14. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  15. ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  16. WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  17. DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
  18. ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  19. (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  20. LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
  21. ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  22. (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  23. SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  24. */
  25. #include "config.h"
  26. #include <iostream>
  27. #include <sstream>
  28. #include <string>
  29. #include <sys/param.h>
  30. #include <sys/types.h>
  31. #include <sys/stat.h>
  32. #include <sys/wait.h>
  33. #include <assert.h>
  34. #include <fcntl.h>
  35. #include <stdio.h>
  36. #include <stdlib.h>
  37. #include <unistd.h>
  38. #include <st.h>
  39. #include "debug.h"
  40. #include "logging.h"
  41. #include "util.h"
  42. #include "global.h"
  43. #include "commandPort.h"
  44. #include "listenInterface.h"
  45. #include "listenBacklog.h"
  46. #include "shmLookup.h"
  47. #include "pidMap.h"
  48. #include "manager.h"
  49. #include "process.h"
  50. #include "service.h"
  51. #include "serviceKey.h"
  52. using namespace std;
  53. //////////////////////////////////////////////////////////////////////
  54. // The manager is the controlling state-thread. It establishes all the
  55. // necessary framework to start service processes and log results. The
  56. // main role of the manager thread while running is to distribute
  57. // child exit signals and test for config changes.
  58. //////////////////////////////////////////////////////////////////////
  59. manager::manager()
  60. : _commandPortInterface(0), _configurationDirectory("."), _rendezvousDirectory("."),
  61. _lookupMapFile("./lookup.map"),
  62. _emergencyExitDelay(30),
  63. _statisticsLogInterval(600), _defaultUID(-1), _defaultGID(-1), _stStackSize(0),
  64. _logStatsFlag(false),
  65. _configurationReloadFlag(false), _configurationReloadAfter(0),
  66. _reapChildrenFlag(false),
  67. _quitMessage(0), _commandAcceptSocket(-1),
  68. _serviceCount(0),
  69. _activeProcessCount(0), _maximumProcessCount(0), _childCount(0), _zombieCount(0),
  70. _processAdded(0), _requestsReported(0),
  71. _serviceMap(new serviceMapType), _shmLookupPtr(new shmLookup),
  72. _startTime(time(0)), _forkLimiter(3)
  73. {
  74. zeroPeriodicCounts();
  75. }
  76. //////////////////////////////////////////////////////////////////////
  77. manager::~manager()
  78. {
  79. if (_shmLookupPtr) delete _shmLookupPtr;
  80. if (_serviceMap) delete _serviceMap;
  81. }
  82. //////////////////////////////////////////////////////////////////////
  83. bool
  84. manager::initialize()
  85. {
  86. setThreadID(st_thread_self());
  87. if (!_LB.initialize(_errorMessage)) return false; // Listen Backlog
  88. if (!checkRendezvousDirectory()) return false;
  89. return true;
  90. }
  91. //////////////////////////////////////////////////////////////////////
  92. // Make sure - as best we can - that the Rendezvous directory is
  93. // accessible. Also, make it absolute if it's a relative path as it
  94. // forms part of the path in the lookupMap.
  95. //////////////////////////////////////////////////////////////////////
  96. bool
  97. manager::checkRendezvousDirectory()
  98. {
  99. struct stat sb;
  100. const char* rvPath = _rendezvousDirectory.c_str();
  101. if (stat(rvPath, &sb) == -1) {
  102. util::messageWithErrno(_errorMessage, "Could not stat() rendezvousDirectory", rvPath);
  103. return false;
  104. }
  105. if (!(sb.st_mode & S_IFDIR)) {
  106. errno = ENOTDIR;
  107. util::messageWithErrno(_errorMessage, "rendezvousDirectory is not a directory", rvPath);
  108. return false;
  109. }
  110. if (access(rvPath, R_OK | W_OK | X_OK | F_OK) == -1) {
  111. util::messageWithErrno(_errorMessage, "rendezvousDirectory inaccessible", rvPath);
  112. return false;
  113. }
  114. // If the path is absolute we're done
  115. if (*rvPath == '/') return true;
  116. // Convert relative to absolute - as best we can. Note the the RHEL
  117. // manpage warns against realpath(3) - but I think the rationale is
  118. // largely specious.
  119. char workBuffer[MAXPATHLEN * 4];
  120. workBuffer[MAXPATHLEN * 4 - 3] = '\3'; // Put up a small picket fence
  121. workBuffer[MAXPATHLEN * 4 - 2] = '\2'; // just for the parnoid RHEL folk
  122. workBuffer[MAXPATHLEN * 4 - 1] = '\1';
  123. _rendezvousDirectory = realpath(rvPath, workBuffer);
  124. assert(workBuffer[MAXPATHLEN * 4 - 3] == '\3'); // Check that the fence
  125. assert(workBuffer[MAXPATHLEN * 4 - 2] == '\2'); // is still in good
  126. assert(workBuffer[MAXPATHLEN * 4 - 1] == '\1'); // standing.
  127. LOGPRT << "Manager Rendezvous realpath: " << _rendezvousDirectory << endl;
  128. return true;
  129. }
  130. //////////////////////////////////////////////////////////////////////////
  131. // Initialize the listening socket for the command port. Return false
  132. // on any failure with an error message.
  133. //////////////////////////////////////////////////////////////////////////
  134. bool
  135. manager::initializeCommandPort(std::string& em)
  136. {
  137. if (!_commandPortInterface) return true;
  138. listenInterface li;
  139. if (li.openAndListen(_commandPortInterface)) {
  140. util::messageWithErrno(em, 0, _commandPortInterface);
  141. return false;
  142. }
  143. _commandAcceptSocket = li.transferFD(); // Take ownership of the FD
  144. if (util::setCloseOnExec(_commandAcceptSocket) == -1) {
  145. util::messageWithErrno(em, "Could not set FD_CLOEXEC on Command Accept Socket",
  146. _commandPortInterface);
  147. close(_commandAcceptSocket);
  148. _commandAcceptSocket = -1;
  149. return false;
  150. }
  151. if (!st_thread_create(commandPort::listen, static_cast<void*>(this), 0, _stStackSize)) {
  152. util::messageWithErrno(em, "st_thread_create(commandPort::listen) failed");
  153. close(_commandAcceptSocket);
  154. _commandAcceptSocket = -1;
  155. return false;
  156. }
  157. return true;
  158. }
  159. //////////////////////////////////////////////////////////////////////
  160. void
  161. manager::initiateShutdownSequence(const char* reason)
  162. {
  163. if (debug::manager()) DBGPRT << "manager::initiateShutdownSequence() " << _logID << endl;
  164. if (shutdownInProgress()) return;
  165. LOGPRT << "Manager shutdown: " << reason << endl;
  166. baseInitiateShutdownSequence();
  167. for (serviceMapIter mi=_serviceMap->begin(); mi!=_serviceMap->end(); ++mi) {
  168. mi->second->initiateShutdownSequence(reason);
  169. }
  170. }
  171. //////////////////////////////////////////////////////////////////////
  172. // Report on residual object count. They should all be zero if the
  173. // classes are destroying their objects correctly.
  174. //////////////////////////////////////////////////////////////////////
  175. void
  176. manager::completeShutdownSequence()
  177. {
  178. LOGPRT << "Final Objects:"
  179. << " Map=" << _serviceMap->size()
  180. << " Services=" << service::getCurrentObjectCount()
  181. << " Process=" << process::getCurrentObjectCount()
  182. << " Children=" << _childCount
  183. << " Zombies=" << _zombieCount
  184. << endl;
  185. }
  186. //////////////////////////////////////////////////////////////////////
  187. // When destroing a service, if it has an entry in the serviceMap then
  188. // the serviceMap entry and the corresponding acceptPath are removed
  189. // otherwise the service has been replaced and the Map and path
  190. // remain. This ensures that a service is always available to clients,
  191. // even when it is being replaced due to an updated config.
  192. //////////////////////////////////////////////////////////////////////
  193. void
  194. manager::destroyOffspring(threadedObject* to, const char* reason)
  195. {
  196. service* S = dynamic_cast<service*>(to);
  197. const service* matchingS = findServiceInServiceMap(S->getName());
  198. if (!matchingS || (matchingS == S)) {
  199. if (matchingS) _serviceMap->erase(S->getName());
  200. S->removeAcceptPath();
  201. S->removeServiceMap();
  202. }
  203. --_serviceCount;
  204. delete S;
  205. notifyOwner("manager", "destroyOffspring"); // Tell owner so it can recalibrate/exit
  206. }
  207. //////////////////////////////////////////////////////////////////////
  208. void
  209. manager::addProcessCount()
  210. {
  211. ++_activeProcessCount;
  212. _maximumProcessCount = max(_maximumProcessCount, _activeProcessCount);
  213. }
  214. void
  215. manager::subtractProcessCount(processExit::reason res)
  216. {
  217. --_activeProcessCount;
  218. ++_exitReasonCount[res];
  219. }
  220. void
  221. manager::zeroPeriodicCounts()
  222. {
  223. for (int ix=processExit::noReason; ix<processExit::maxReasonCount; ++ix) {
  224. _exitReasonCount[ix] = 0;
  225. }
  226. }
  227. //////////////////////////////////////////////////////////////////////
  228. void
  229. manager::setQuitMessage(const char* m)
  230. {
  231. _quitMessage = m;
  232. notifyOwner("manager", m);
  233. }
  234. void
  235. manager::setConfigurationReload(bool newState, time_t changesSince)
  236. {
  237. _configurationReloadFlag = newState;
  238. _configurationReloadAfter = changesSince;
  239. }
  240. //////////////////////////////////////////////////////////////////////
  241. // Rebuild the shared memory lookup structure based on the service
  242. // map. The Service Key and the Search Key are *not* the
  243. // same. serviceKey() knows how to construct the Search Key.
  244. //
  245. // The keys are first constructed in a temporary hash so that the size
  246. // of the shm can be determined once all keys are processed. With the
  247. // known size, the new shm is created and the temporary hashmap
  248. // details are transferred to shm.
  249. //////////////////////////////////////////////////////////////////////
  250. const char*
  251. manager::rebuildLookup()
  252. {
  253. shmLookup::mapType keysIn; // Temporary key holder
  254. for (serviceMapConstIter ix=_serviceMap->begin(); ix != _serviceMap->end(); ++ix) {
  255. service* sm = ix->second;
  256. pluton::serviceKey SK; // Construct the Search Key from the Service Key
  257. SK.parse(sm->getName(), false);
  258. string sk;
  259. SK.getSearchKey(sk);
  260. keysIn[sk] = sm->getAcceptPath(); // and add it into the temporary hash.
  261. }
  262. const char* res = _shmLookupPtr->buildMap(_lookupMapFile, keysIn);
  263. if (debug::oneShot()) _shmLookupPtr->dumpMap();
  264. return res;
  265. }
  266. //////////////////////////////////////////////////////////////////////
  267. // The main loop of the manager:
  268. // o Periodically make status reports
  269. // o Notice child exits and tell the controlling thread
  270. // o Notice config change signals and reload configurations
  271. //////////////////////////////////////////////////////////////////////
  272. bool
  273. manager::run()
  274. {
  275. time_t nextReport = 0;
  276. while (!_quitMessage) {
  277. if (_configurationReloadFlag) {
  278. LOGPRT << "Manager Signal: Configuration Reload" << endl;
  279. if (!loadConfigurations(_configurationReloadAfter)) break;
  280. _configurationReloadFlag = false;
  281. }
  282. if (_reapChildrenFlag) {
  283. _reapChildrenFlag = false;
  284. pidMap::reapChildren("manager::run");
  285. }
  286. time_t now = st_time();
  287. if (nextReport <= now) {
  288. periodicStatisticsReport(now);
  289. nextReport = now + _statisticsLogInterval;
  290. }
  291. LOGFLUSH;
  292. enableInterrupts(); // Wait for an interrupt or
  293. int res = st_usleep(util::MICROSECOND/4); // Could do the self-pipe trick to get notified
  294. if (debug::manager()) DBGPRT << "manager::st_usleep=" << res << " errno=" << errno << endl;
  295. disableInterrupts();
  296. }
  297. return false;
  298. }
  299. //////////////////////////////////////////////////////////////////////.
  300. // Give the services a chance to shutdown
  301. //////////////////////////////////////////////////////////////////////
  302. void
  303. manager::runUntilIdle()
  304. {
  305. if (debug::manager()) DBGPRT << "manager::runUntilIdle() " << _logID << endl;
  306. enableInterrupts();
  307. while (_serviceCount > 0) {
  308. st_sleep(1);
  309. if (_reapChildrenFlag) {
  310. _reapChildrenFlag = false;
  311. pidMap::reapChildren("manager::runUntilIdle");
  312. }
  313. if ((_serviceCount > 0) && debug::manager()) {
  314. DBGPRT << "Manager exit: waiting on " << _serviceCount << endl;
  315. ostringstream os;
  316. process::list(os, 0);
  317. DBGPRT << os.str() << endl;
  318. }
  319. }
  320. }
  321. //////////////////////////////////////////////////////////////////////
  322. service*
  323. manager::findServiceInServiceMap(const std::string& name)
  324. {
  325. serviceMapIter mi = _serviceMap->find(name);
  326. if (mi == _serviceMap->end()) return 0;
  327. return mi->second;
  328. }