PageRenderTime 228ms CodeModel.GetById 25ms RepoModel.GetById 0ms app.codeStats 0ms

/checks.d/nagios.py

https://gitlab.com/meetly/dd-agent
Python | 395 lines | 288 code | 41 blank | 66 comment | 40 complexity | 23cceed90ebebcc542fa6aae243fce72 MD5 | raw file
  1. # stdlib
  2. from collections import namedtuple
  3. import re
  4. # project
  5. from checks import AgentCheck
  6. from utils.tailfile import TailFile
  7. # fields order for each event type, as named tuples
  8. EVENT_FIELDS = {
  9. 'CURRENT HOST STATE': namedtuple('E_CurrentHostState', 'host, event_state, event_soft_hard, return_code, payload'),
  10. 'CURRENT SERVICE STATE': namedtuple('E_CurrentServiceState', 'host, check_name, event_state, event_soft_hard, return_code, payload'),
  11. 'SERVICE ALERT': namedtuple('E_ServiceAlert', 'host, check_name, event_state, event_soft_hard, return_code, payload'),
  12. 'PASSIVE SERVICE CHECK': namedtuple('E_PassiveServiceCheck', 'host, check_name, return_code, payload'),
  13. 'HOST ALERT': namedtuple('E_HostAlert', 'host, event_state, event_soft_hard, return_code, payload'),
  14. # [1305744274] SERVICE NOTIFICATION: ops;ip-10-114-237-165;Metric ETL;ACKNOWLEDGEMENT (CRITICAL);notify-service-by-email;HTTP CRITICAL: HTTP/1.1 503 Service Unavailable - 394 bytes in 0.010 second response time;datadog;alq
  15. 'SERVICE NOTIFICATION': namedtuple('E_ServiceNotification', 'contact, host, check_name, event_state, notification_type, payload'),
  16. # [1296509331] SERVICE FLAPPING ALERT: ip-10-114-97-27;cassandra JVM Heap;STARTED; Service appears to have started flapping (23.4% change >= 20.0% threshold)
  17. # [1296662511] SERVICE FLAPPING ALERT: ip-10-114-97-27;cassandra JVM Heap;STOPPED; Service appears to have stopped flapping (3.8% change < 5.0% threshold)
  18. 'SERVICE FLAPPING ALERT': namedtuple('E_FlappingAlert', 'host, check_name, flap_start_stop, payload'),
  19. # Reference for external commands: http://old.nagios.org/developerinfo/externalcommands/commandlist.php
  20. # Command Format:
  21. # ACKNOWLEDGE_SVC_PROBLEM;<host_name>;<service_description>;<sticky>;<notify>;<persistent>;<author>;<comment>
  22. # [1305832665] EXTERNAL COMMAND: ACKNOWLEDGE_SVC_PROBLEM;ip-10-202-161-236;Resources ETL;2;1;0;datadog;alq checking
  23. 'ACKNOWLEDGE_SVC_PROBLEM': namedtuple('E_ServiceAck', 'host, check_name, sticky_ack, notify_ack, persistent_ack, ack_author, payload'),
  24. # Command Format:
  25. # ACKNOWLEDGE_HOST_PROBLEM;<host_name>;<sticky>;<notify>;<persistent>;<author>;<comment>
  26. 'ACKNOWLEDGE_HOST_PROBLEM': namedtuple('E_HostAck', 'host, sticky_ack, notify_ack, persistent_ack, ack_author, payload'),
  27. # Comment Format:
  28. # PROCESS_SERVICE_CHECK_RESULT;<host_name>;<service_description>;<result_code>;<comment>
  29. # We ignore it because Nagios will log a "PASSIVE SERVICE CHECK" after
  30. # receiving this, and we don't want duplicate events to be counted.
  31. 'PROCESS_SERVICE_CHECK_RESULT': False,
  32. # Host Downtime
  33. # [1297894825] HOST DOWNTIME ALERT: ip-10-114-89-59;STARTED; Host has entered a period of scheduled downtime
  34. # [1297894825] SERVICE DOWNTIME ALERT: ip-10-114-237-165;intake;STARTED; Service has entered a period of scheduled downtime
  35. 'HOST DOWNTIME ALERT': namedtuple('E_HostDowntime', 'host, downtime_start_stop, payload'),
  36. 'SERVICE DOWNTIME ALERT': namedtuple('E_ServiceDowntime', 'host, check_name, downtime_start_stop, payload'),
  37. }
  38. # Regex for the Nagios event log
  39. RE_LINE_REG = re.compile('^\[(\d+)\] EXTERNAL COMMAND: (\w+);(.*)$')
  40. RE_LINE_EXT = re.compile('^\[(\d+)\] ([^:]+): (.*)$')
  41. class Nagios(AgentCheck):
  42. NAGIOS_CONF_KEYS = [
  43. re.compile('^(?P<key>log_file)\s*=\s*(?P<value>.+)$'),
  44. re.compile('^(?P<key>host_perfdata_file_template)\s*=\s*(?P<value>.+)$'),
  45. re.compile('^(?P<key>service_perfdata_file_template)\s*=\s*(?P<value>.+)$'),
  46. re.compile('^(?P<key>host_perfdata_file)\s*=\s*(?P<value>.+)$'),
  47. re.compile('^(?P<key>service_perfdata_file)\s*=\s*(?P<value>.+)$'),
  48. ]
  49. def __init__(self, name, init_config, agentConfig, instances=None):
  50. AgentCheck.__init__(self, name, init_config, agentConfig, instances)
  51. self.nagios_tails = {}
  52. check_freq = init_config.get("check_freq", 15)
  53. if instances is not None:
  54. for instance in instances:
  55. tailers = []
  56. nagios_conf = {}
  57. instance_key = None
  58. if 'nagios_conf' in instance: # conf.d check
  59. conf_path = instance['nagios_conf']
  60. nagios_conf = self.parse_nagios_config(conf_path)
  61. instance_key = conf_path
  62. # Retrocompatibility Code
  63. elif 'nagios_perf_cfg' in instance:
  64. conf_path = instance['nagios_perf_cfg']
  65. nagios_conf = self.parse_nagios_config(conf_path)
  66. instance["collect_host_performance_data"] = True
  67. instance["collect_service_performance_data"] = True
  68. instance_key = conf_path
  69. if 'nagios_log' in instance:
  70. nagios_conf["log_file"] = instance['nagios_log']
  71. if instance_key is None:
  72. instance_key = instance['nagios_log']
  73. # End of retrocompatibility code
  74. if not nagios_conf:
  75. self.log.warning("Missing path to nagios_conf")
  76. continue
  77. if 'log_file' in nagios_conf and \
  78. instance.get('collect_events', True):
  79. self.log.debug("Starting to tail the event log")
  80. tailers.append(NagiosEventLogTailer(
  81. log_path=nagios_conf['log_file'],
  82. file_template=None,
  83. logger=self.log,
  84. hostname=self.hostname,
  85. event_func=self.event,
  86. gauge_func=self.gauge,
  87. freq=check_freq,
  88. passive_checks=instance.get('passive_checks_events', False)))
  89. if 'host_perfdata_file' in nagios_conf and \
  90. 'host_perfdata_file_template' in nagios_conf and \
  91. instance.get('collect_host_performance_data', False):
  92. self.log.debug("Starting to tail the host_perfdata file")
  93. tailers.append(NagiosHostPerfDataTailer(
  94. log_path=nagios_conf['host_perfdata_file'],
  95. file_template=nagios_conf['host_perfdata_file_template'],
  96. logger=self.log,
  97. hostname=self.hostname,
  98. event_func=self.event,
  99. gauge_func=self.gauge,
  100. freq=check_freq))
  101. if 'service_perfdata_file' in nagios_conf and \
  102. 'service_perfdata_file_template' in nagios_conf and \
  103. instance.get('collect_service_performance_data', False):
  104. self.log.debug("Starting to tail the service_perfdata file")
  105. tailers.append(NagiosServicePerfDataTailer(
  106. log_path=nagios_conf['service_perfdata_file'],
  107. file_template=nagios_conf['service_perfdata_file_template'],
  108. logger=self.log,
  109. hostname=self.hostname,
  110. event_func=self.event,
  111. gauge_func=self.gauge,
  112. freq=check_freq))
  113. self.nagios_tails[instance_key] = tailers
  114. def parse_nagios_config(self, filename):
  115. output = {}
  116. f = None
  117. try:
  118. f = open(filename)
  119. for line in f:
  120. line = line.strip()
  121. if not line:
  122. continue
  123. for key in self.NAGIOS_CONF_KEYS:
  124. m = key.match(line)
  125. if m:
  126. output[m.group('key')] = m.group('value')
  127. break
  128. return output
  129. except Exception as e:
  130. # Can't parse, assume it's just not working
  131. # Don't return an incomplete config
  132. self.log.exception(e)
  133. raise Exception("Could not parse Nagios config file")
  134. finally:
  135. if f is not None:
  136. f.close()
  137. def check(self, instance):
  138. '''
  139. Parse until the end of each tailer associated with this instance.
  140. We match instance and tailers based on the path to the Nagios configuration file
  141. Special case: Compatibility with the old conf when no conf file is specified
  142. but the path to the event_log is given
  143. '''
  144. instance_key = instance.get('nagios_conf',
  145. instance.get('nagios_perf_cfg',
  146. instance.get('nagios_log',
  147. None)))
  148. # Bad configuration: This instance does not contain any necessary configuration
  149. if not instance_key or instance_key not in self.nagios_tails:
  150. raise Exception('No Nagios configuration file specified')
  151. for tailer in self.nagios_tails[instance_key]:
  152. tailer.check()
  153. class NagiosTailer(object):
  154. def __init__(self, log_path, file_template, logger, hostname, event_func, gauge_func, freq):
  155. '''
  156. :param log_path: string, path to the file to parse
  157. :param file_template: string, format of the perfdata file
  158. :param logger: Logger object
  159. :param hostname: string, name of the host this agent is running on
  160. :param event_func: function to create event, should accept dict
  161. :param gauge_func: function to report a gauge
  162. :param freq: int, size of bucket to aggregate perfdata metrics
  163. '''
  164. self.log_path = log_path
  165. self.log = logger
  166. self.gen = None
  167. self.tail = None
  168. self.hostname = hostname
  169. self._event = event_func
  170. self._gauge = gauge_func
  171. self._line_parsed = 0
  172. self._freq = freq
  173. if file_template is not None:
  174. self.compile_file_template(file_template)
  175. self.tail = TailFile(self.log, self.log_path, self._parse_line)
  176. self.gen = self.tail.tail(line_by_line=False, move_end=True)
  177. self.gen.next()
  178. def check(self):
  179. self._line_parsed = 0
  180. # read until the end of file
  181. try:
  182. self.log.debug("Start nagios check for file %s" % (self.log_path))
  183. self.gen.next()
  184. self.log.debug("Done nagios check for file %s (parsed %s line(s))" %
  185. (self.log_path, self._line_parsed))
  186. except StopIteration, e:
  187. self.log.exception(e)
  188. self.log.warning("Can't tail %s file" % (self.log_path))
  189. def compile_file_template(self, file_template):
  190. try:
  191. # Escape characters that will be interpreted as regex bits
  192. # e.g. [ and ] in "[SERVICEPERFDATA]"
  193. regex = re.sub(r'[[\]*]', r'.', file_template)
  194. regex = re.sub(r'\$([^\$]*)\$', r'(?P<\1>[^\$]*)', regex)
  195. self.line_pattern = re.compile(regex)
  196. except Exception, e:
  197. raise InvalidDataTemplate("%s (%s)" % (file_template, e))
  198. class NagiosEventLogTailer(NagiosTailer):
  199. def __init__(self, log_path, file_template, logger, hostname, event_func,
  200. gauge_func, freq, passive_checks=False):
  201. '''
  202. :param log_path: string, path to the file to parse
  203. :param file_template: string, format of the perfdata file
  204. :param logger: Logger object
  205. :param hostname: string, name of the host this agent is running on
  206. :param event_func: function to create event, should accept dict
  207. :param gauge_func: function to report a gauge
  208. :param freq: int, size of bucket to aggregate perfdata metrics
  209. :param passive_checks: bool, enable or not passive checks events
  210. '''
  211. self.passive_checks = passive_checks
  212. super(NagiosEventLogTailer, self).__init__(
  213. log_path, file_template,
  214. logger, hostname, event_func, gauge_func, freq
  215. )
  216. def _parse_line(self, line):
  217. """Actual nagios parsing
  218. Return True if we found an event, False otherwise
  219. """
  220. # first isolate the timestamp and the event type
  221. try:
  222. self._line_parsed = self._line_parsed + 1
  223. m = RE_LINE_REG.match(line)
  224. if m is None:
  225. m = RE_LINE_EXT.match(line)
  226. if m is None:
  227. return False
  228. self.log.debug("Matching line found %s" % line)
  229. (tstamp, event_type, remainder) = m.groups()
  230. tstamp = int(tstamp)
  231. # skip passive checks reports by default for spamminess
  232. if event_type == 'PASSIVE SERVICE CHECK' and not self.passive_checks:
  233. return False
  234. # then retrieve the event format for each specific event type
  235. fields = EVENT_FIELDS.get(event_type, None)
  236. if fields is None:
  237. self.log.warning("Ignoring unknown nagios event for line: %s" % (line[:-1]))
  238. return False
  239. elif fields is False:
  240. # Ignore and skip
  241. self.log.debug("Ignoring Nagios event for line: %s" % (line[:-1]))
  242. return False
  243. # and parse the rest of the line
  244. parts = map(lambda p: p.strip(), remainder.split(';'))
  245. # Chop parts we don't recognize
  246. parts = parts[:len(fields._fields)]
  247. event = self.create_event(tstamp, event_type, self.hostname, fields._make(parts))
  248. self._event(event)
  249. self.log.debug("Nagios event: %s" % (event))
  250. return True
  251. except Exception:
  252. self.log.exception("Unable to create a nagios event from line: [%s]" % (line))
  253. return False
  254. def create_event(self, timestamp, event_type, hostname, fields):
  255. """Factory method called by the parsers
  256. """
  257. d = fields._asdict()
  258. d.update({'timestamp': timestamp,
  259. 'event_type': event_type})
  260. # if host is localhost, turn that into the internal host name
  261. host = d.get('host', None)
  262. if host == "localhost":
  263. d["host"] = hostname
  264. return d
  265. class NagiosPerfDataTailer(NagiosTailer):
  266. perfdata_field = '' # Should be overriden by subclasses
  267. metric_prefix = 'nagios'
  268. pair_pattern = re.compile(r"".join([
  269. r"'?(?P<label>[^=']+)'?=",
  270. r"(?P<value>[-0-9.]+)",
  271. r"(?P<unit>s|us|ms|%|B|KB|MB|GB|TB|c)?",
  272. r"(;(?P<warn>@?[-0-9.~]*:?[-0-9.~]*))?",
  273. r"(;(?P<crit>@?[-0-9.~]*:?[-0-9.~]*))?",
  274. r"(;(?P<min>[-0-9.]*))?",
  275. r"(;(?P<max>[-0-9.]*))?",
  276. ]))
  277. @staticmethod
  278. def underscorize(s):
  279. return s.replace(' ', '_').lower()
  280. def _get_metric_prefix(self, data):
  281. raise NotImplementedError()
  282. def _parse_line(self, line):
  283. matched = self.line_pattern.match(line)
  284. output = []
  285. if matched:
  286. self.log.debug("Matching line found %s" % line)
  287. data = matched.groupdict()
  288. metric_prefix = self._get_metric_prefix(data)
  289. # Parse the prefdata values, which are a space-delimited list of:
  290. # 'label'=value[UOM];[warn];[crit];[min];[max]
  291. perf_data = data.get(self.perfdata_field, '').split(' ')
  292. for pair in perf_data:
  293. pair_match = self.pair_pattern.match(pair)
  294. if not pair_match:
  295. continue
  296. else:
  297. pair_data = pair_match.groupdict()
  298. label = pair_data['label']
  299. timestamp = data.get('TIMET', None)
  300. if timestamp is not None:
  301. timestamp = (int(float(timestamp)) / self._freq) * self._freq
  302. value = float(pair_data['value'])
  303. device_name = None
  304. if '/' in label:
  305. # Special case: if the label begins
  306. # with a /, treat the label as the device
  307. # and use the metric prefix as the metric name
  308. metric = '.'.join(metric_prefix)
  309. device_name = label
  310. else:
  311. # Otherwise, append the label to the metric prefix
  312. # and use that as the metric name
  313. metric = '.'.join(metric_prefix + [label])
  314. host_name = data.get('HOSTNAME', self.hostname)
  315. optional_keys = ['unit', 'warn', 'crit', 'min', 'max']
  316. tags = []
  317. for key in optional_keys:
  318. attr_val = pair_data.get(key, None)
  319. if attr_val is not None and attr_val != '':
  320. tags.append("{0}:{1}".format(key, attr_val))
  321. self._gauge(metric, value, tags, host_name, device_name, timestamp)
  322. class NagiosHostPerfDataTailer(NagiosPerfDataTailer):
  323. perfdata_field = 'HOSTPERFDATA'
  324. def _get_metric_prefix(self, line_data):
  325. return [self.metric_prefix, 'host']
  326. class NagiosServicePerfDataTailer(NagiosPerfDataTailer):
  327. perfdata_field = 'SERVICEPERFDATA'
  328. def _get_metric_prefix(self, line_data):
  329. metric = [self.metric_prefix]
  330. middle_name = line_data.get('SERVICEDESC', None)
  331. if middle_name:
  332. metric.append(middle_name.replace(' ', '_').lower())
  333. return metric
  334. class InvalidDataTemplate(Exception):
  335. pass