PageRenderTime 68ms CodeModel.GetById 39ms RepoModel.GetById 1ms app.codeStats 0ms

/src/python/WMComponent/AlertGenerator/Pollers/Couch.py

https://github.com/PerilousApricot/WMCore-OLDOLD
Python | 291 lines | 258 code | 9 blank | 24 comment | 0 complexity | 505f7696d9df43221d180293714df080 MD5 | raw file
  1. """
  2. Module for all CouchDb related polling.
  3. """
  4. import os
  5. import logging
  6. import types
  7. import time
  8. from WMCore.Alerts.ZMQ.Sender import Sender
  9. from WMCore.Database.CMSCouch import CouchServer
  10. from WMComponent.AlertGenerator.Pollers.Base import PeriodPoller
  11. from WMComponent.AlertGenerator.Pollers.Base import Measurements
  12. from WMCore.Alerts.Alert import Alert
  13. from WMComponent.AlertGenerator.Pollers.Base import BasePoller
  14. from WMComponent.AlertGenerator.Pollers.Base import ProcessDetail
  15. from WMComponent.AlertGenerator.Pollers.System import DirectorySizePoller
  16. from WMComponent.AlertGenerator.Pollers.System import ProcessCPUPoller
  17. from WMComponent.AlertGenerator.Pollers.System import ProcessMemoryPoller
# TODO
# Sending of initialisation (set up) alerts shall be factored out to a level
# above these pollers, likely into BaseSender, with a proper comment: such a
# Sender is created and used from the initialisation process, unlike the
# other Sender, which is used from the polling process.
  22. class CouchPoller(PeriodPoller):
  23. """
  24. Common class for Couch CPU, memory utilisation monitoring and possibly
  25. further future properties.
  26. """
  27. def __init__(self, config, generator):
  28. PeriodPoller.__init__(self, config, generator)
  29. # ProcessDetail class instance (main process plus subprocesses)
  30. self._dbProcessDetail = None
  31. # instance of Measurements
  32. self._measurements = None
  33. self._setUp()
  34. def _getProcessPID(self):
  35. """
  36. Return the PID number of the Couch main server process.
  37. Standard / default location of log / PID files is:
  38. /var/log/couchdb/<ver>/couch.log
  39. /var/run/couchdb/couchdb.pid
  40. WMAgent Deployment/wmagent/manage defines $INSTALL_COUCH/logs/
  41. in which log files and PID file are stored.
  42. First try to read '/_config' query result and check if
  43. $INSTALL_COUCH/logs/couchdb.pid exists (location of PID file derived
  44. from log file location / log directory).
  45. If such PID file does not exist, try default Couch PID file location.
  46. """
  47. pidFileName = "couchdb.pid"
  48. pidFileDefault = os.path.join("/var/run/couchdb", pidFileName)
  49. try:
  50. couchURL = getattr(self.config, "couchURL", None)
  51. if not couchURL:
  52. raise Exception("Configuration value 'couchURL' missing, can't connect to Couch.")
  53. couch = CouchServer(couchURL)
  54. r = couch.makeRequest("/_config")
  55. logFile = r["log"]["file"]
  56. # derive location of the PID file from full path log file name
  57. dir = os.path.dirname(logFile)
  58. pidFile = os.path.join(dir, pidFileName)
  59. if os.path.exists(pidFile):
  60. pidStr = open(pidFile, 'r').read()
  61. pid = int(pidStr)
  62. return pid
  63. else:
  64. pidStr = open(pidFileDefault, 'r').read()
  65. pid = int(pidStr)
  66. return pid
  67. except Exception, ex:
  68. logging.error("%s: could not get CouchDB PID, reason: %s" %
  69. (self.__class__.__name__, ex))
  70. raise
  71. def _setUp(self):
  72. """
  73. Query the database to find out the main process PID,
  74. create ProcessDetail and Measurements instances.
  75. """
  76. try:
  77. pid = self._getProcessPID()
  78. self._dbProcessDetail = ProcessDetail(pid, "CouchDB")
  79. except Exception, ex:
  80. msg = ("%s: polling not possible, reason: %s" % (self.__class__.__name__, ex))
  81. logging.error(msg)
  82. # send one-off set up alert, instantiate ad-hoc alert Sender
  83. sender = Sender(self.generator.config.Alert.address,
  84. self.__class__.__name__,
  85. self.generator.config.Alert.controlAddr)
  86. a = Alert(**self.preAlert)
  87. a["Source"] = self.__class__.__name__
  88. a["Timestamp"] = time.time()
  89. a["Details"] = dict(msg = msg)
  90. a["Level"] = 10
  91. sender(a)
  92. return
  93. numOfMeasurements = round(self.config.period / self.config.pollInterval, 0)
  94. self._measurements = Measurements(numOfMeasurements)
  95. def check(self):
  96. """
  97. Above, the database server psutil.Process instance creation may have
  98. failed. Proceed with checking only if the instance exists.
  99. """
  100. if self._dbProcessDetail:
  101. PeriodPoller.check(self, self._dbProcessDetail, self._measurements)
  102. class CouchDbSizePoller(DirectorySizePoller):
  103. """
  104. Class implements monitoring / polling of the CouchDb database size.
  105. """
  106. def __init__(self, config, generator):
  107. DirectorySizePoller.__init__(self, config, generator)
  108. self._query = "/_config" # couch query to retrieve configuration info
  109. # database directory to monitor
  110. self._dbDirectory = self._getDbDir()
  111. def _getDbDir(self):
  112. """
  113. Connect to CouchDb instance and query its database directory name.
  114. """
  115. try:
  116. couchURL = getattr(self.config, "couchURL", None)
  117. couch = CouchServer(couchURL)
  118. r = couch.makeRequest(self._query)
  119. dataDir = r["couchdb"]["database_dir"]
  120. except Exception, ex:
  121. msg = ("%s: could not find out database directory, reason: %s" %
  122. (self.__class__.__name__, ex))
  123. logging.error(msg)
  124. # send one-off set up alert, instantiate ad-hoc alert Sender
  125. sender = Sender(self.generator.config.Alert.address,
  126. self.__class__.__name__,
  127. self.generator.config.Alert.controlAddr)
  128. a = Alert(**self.preAlert)
  129. a["Source"] = self.__class__.__name__
  130. a["Timestamp"] = time.time()
  131. a["Details"] = dict(msg = msg)
  132. a["Level"] = 10
  133. sender(a)
  134. dataDir = None
  135. return dataDir
  136. class CouchMemoryPoller(CouchPoller):
  137. """
  138. CouchDB memory utilisation poller.
  139. """
  140. def __init__(self, config, generator):
  141. CouchPoller.__init__(self, config, generator)
  142. @staticmethod
  143. def sample(processDetail):
  144. """
  145. Return a single float representing percentage usage of the main
  146. memory by the process.
  147. """
  148. return ProcessMemoryPoller.sample(processDetail)
  149. class CouchCPUPoller(CouchPoller):
  150. """
  151. Monitoring of CouchDb CPU usage. Monitors the main processes
  152. and its subprocesses.
  153. """
  154. def __init__(self, config, generator):
  155. CouchPoller.__init__(self, config, generator)
  156. @staticmethod
  157. def sample(processDetail):
  158. """
  159. Return a single float representing CPU usage of the main process
  160. and its subprocesses.
  161. """
  162. return ProcessCPUPoller.sample(processDetail)
  163. class CouchErrorsPoller(BasePoller):
  164. """
  165. Polling CouchDb statistics values - number of status error codes
  166. (configurable).
  167. """
  168. def __init__(self, config, generator):
  169. """
  170. couch - instance of CouchServer class
  171. """
  172. BasePoller.__init__(self, config, generator)
  173. self._myName = self.__class__.__name__
  174. self.couch = None
  175. self._query = "/_stats" # couch query to retrieve statistics
  176. self._setUp()
  177. def _setUp(self):
  178. """
  179. Instantiate CouchServer reference.
  180. Test connection with CouchDB (first connect and retrieve attempt).
  181. """
  182. try:
  183. couchURL = getattr(self.config, "couchURL", None)
  184. if not couchURL:
  185. raise Exception("Configuration value 'couchURL' missing, can't connect to Couch.")
  186. self.couch = CouchServer(couchURL)
  187. # retrieves result which is not used during this set up
  188. r = self.couch.makeRequest(self._query)
  189. except Exception, ex:
  190. logging.error("%s: could not connect to CouchDB, reason: %s" %
  191. (self._myName, ex))
  192. # observables shall be list-like integers
  193. if not isinstance(self.config.observables, (types.ListType, types.TupleType)):
  194. self.config.observables = tuple([self.config.observables])
  195. def sample(self, code):
  196. """
  197. Make a query to CouchDB and retrieve number of occurrences of
  198. particular HTTP code as reported by the internal statistics.
  199. If such HTTP codes has not occurred since the server start,
  200. if may not have an entry in the statistics result.
  201. code - string value of the code
  202. """
  203. response = self.couch.makeRequest(self._query)
  204. try:
  205. statusCodes = response["httpd_status_codes"]
  206. statusCode = statusCodes[code]
  207. return statusCode["current"] # another possibility to watch "count"
  208. except KeyError:
  209. return None
  210. def check(self):
  211. """
  212. Method called from the base class.
  213. Iterate over all HTTP status listed in observable config value
  214. and check number of occurrences of each by querying statistics
  215. of CouchDB.
  216. """
  217. for code in self.config.observables:
  218. occurrences = self.sample(str(code))
  219. if occurrences is not None:
  220. for threshold, level in zip(self.thresholds, self.levels):
  221. if occurrences >= threshold:
  222. details = dict(HTTPCode = code,
  223. occurrences = occurrences,
  224. threshold = threshold)
  225. a = Alert(**self.preAlert)
  226. a["Source"] = self._myName
  227. a["Timestamp"] = time.time()
  228. a["Details"] = details
  229. a["Level"] = level
  230. # #2238 AlertGenerator test can take 1 hour+ (and fail)
  231. logging.debug(a)
  232. self.sender(a)
  233. break # send only one alert, critical threshold tested first
  234. m = "%s: checked code:%s current occurrences:%s" % (self._myName, code, occurrences)
  235. logging.debug(m)