PageRenderTime 53ms CodeModel.GetById 25ms RepoModel.GetById 1ms app.codeStats 0ms

/agent.py

https://gitlab.com/meetly/dd-agent
Python | 394 lines | 350 code | 18 blank | 26 comment | 3 complexity | ea764af40552f979636fca7548a3d3ec MD5 | raw file
  1. #!/opt/datadog-agent/embedded/bin/python
  2. '''
  3. Datadog
  4. www.datadoghq.com
  5. ----
  6. Make sense of your IT Data
  7. Licensed under Simplified BSD License (see LICENSE)
  8. (C) Boxed Ice 2010 all rights reserved
  9. (C) Datadog, Inc. 2010-2014 all rights reserved
  10. '''
  11. # set up logging before importing any other components
  12. from config import get_version, initialize_logging # noqa
  13. initialize_logging('collector')
  14. # stdlib
  15. import logging
  16. import os
  17. import signal
  18. import sys
  19. import time
  20. # For pickle & PID files, see issue 293
  21. os.umask(022)
  22. # project
  23. from checks.check_status import CollectorStatus
  24. from checks.collector import Collector
  25. from config import (
  26. get_config,
  27. get_parsed_args,
  28. get_system_stats,
  29. load_check_directory,
  30. )
  31. from daemon import AgentSupervisor, Daemon
  32. from emitter import http_emitter
  33. from util import (
  34. EC2,
  35. get_hostname,
  36. Watchdog,
  37. )
  38. from utils.flare import configcheck, Flare
  39. from utils.jmx import jmx_command
  40. from utils.pidfile import PidFile
  41. from utils.profile import AgentProfiler
# Constants
# Base name of the agent's PID file (main() hands the same literal to PidFile).
PID_NAME = "dd-agent"
# The watchdog is armed with check_freq * WATCHDOG_MULTIPLIER (see Agent._get_watchdog).
WATCHDOG_MULTIPLIER = 10
# Fallback for the 'restart_interval' config key, in seconds.
RESTART_INTERVAL = 4 * 24 * 60 * 60 # Defaults to 4 days
# Commands for which main() logs the agent version before acting.
START_COMMANDS = ['start', 'restart', 'foreground']
# Commands that do NOT trigger the old-command-line deprecation notice in main().
DD_AGENT_COMMANDS = ['check', 'flare', 'jmx']
# Fallback for the 'collector_profile_interval' config key: number of collector
# runs included in a single profile when developer mode is on.
DEFAULT_COLLECTOR_PROFILE_INTERVAL = 20
# Globals
# Module-wide logger, registered under the 'collector' name.
log = logging.getLogger('collector')
  51. class Agent(Daemon):
  52. """
  53. The agent class is a daemon that runs the collector in a background process.
  54. """
  55. def __init__(self, pidfile, autorestart, start_event=True, in_developer_mode=False):
  56. Daemon.__init__(self, pidfile, autorestart=autorestart)
  57. self.run_forever = True
  58. self.collector = None
  59. self.start_event = start_event
  60. self.in_developer_mode = in_developer_mode
  61. self._agentConfig = {}
  62. self._checksd = []
  63. self.collector_profile_interval = DEFAULT_COLLECTOR_PROFILE_INTERVAL
  64. self.check_frequency = None
  65. self.configs_reloaded = False
  66. def _handle_sigterm(self, signum, frame):
  67. """Handles SIGTERM and SIGINT, which gracefully stops the agent."""
  68. log.debug("Caught sigterm. Stopping run loop.")
  69. self.run_forever = False
  70. if self.collector:
  71. self.collector.stop()
  72. log.debug("Collector is stopped.")
  73. def _handle_sigusr1(self, signum, frame):
  74. """Handles SIGUSR1, which signals an exit with an autorestart."""
  75. self._handle_sigterm(signum, frame)
  76. self._do_restart()
  77. def _handle_sighup(self, signum, frame):
  78. """Handles SIGHUP, which signals a configuration reload."""
  79. log.info("SIGHUP caught!")
  80. self.reload_configs()
  81. self.configs_reloaded = True
  82. def reload_configs(self):
  83. """Reloads the agent configuration and checksd configurations."""
  84. log.info("Attempting a configuration reload...")
  85. # Reload checksd configs
  86. hostname = get_hostname(self._agentConfig)
  87. self._checksd = load_check_directory(self._agentConfig, hostname)
  88. # Logging
  89. num_checks = len(self._checksd['initialized_checks'])
  90. if num_checks > 0:
  91. log.info("Successfully reloaded {num_checks} checks".
  92. format(num_checks=num_checks))
  93. else:
  94. log.info("No checksd configs found")
  95. @classmethod
  96. def info(cls, verbose=None):
  97. logging.getLogger().setLevel(logging.ERROR)
  98. return CollectorStatus.print_latest_status(verbose=verbose)
  99. def run(self, config=None):
  100. """Main loop of the collector"""
  101. # Gracefully exit on sigterm.
  102. signal.signal(signal.SIGTERM, self._handle_sigterm)
  103. # A SIGUSR1 signals an exit with an autorestart
  104. signal.signal(signal.SIGUSR1, self._handle_sigusr1)
  105. # Handle Keyboard Interrupt
  106. signal.signal(signal.SIGINT, self._handle_sigterm)
  107. # A SIGHUP signals a configuration reload
  108. signal.signal(signal.SIGHUP, self._handle_sighup)
  109. # Save the agent start-up stats.
  110. CollectorStatus().persist()
  111. # Intialize the collector.
  112. if not config:
  113. config = get_config(parse_args=True)
  114. self._agentConfig = self._set_agent_config_hostname(config)
  115. hostname = get_hostname(self._agentConfig)
  116. systemStats = get_system_stats()
  117. emitters = self._get_emitters()
  118. # Load the checks.d checks
  119. self._checksd = load_check_directory(self._agentConfig, hostname)
  120. # Initialize the Collector
  121. self.collector = Collector(self._agentConfig, emitters, systemStats, hostname)
  122. # In developer mode, the number of runs to be included in a single collector profile
  123. self.collector_profile_interval = self._agentConfig.get('collector_profile_interval',
  124. DEFAULT_COLLECTOR_PROFILE_INTERVAL)
  125. # Configure the watchdog.
  126. self.check_frequency = int(self._agentConfig['check_freq'])
  127. watchdog = self._get_watchdog(self.check_frequency)
  128. # Initialize the auto-restarter
  129. self.restart_interval = int(self._agentConfig.get('restart_interval', RESTART_INTERVAL))
  130. self.agent_start = time.time()
  131. profiled = False
  132. collector_profiled_runs = 0
  133. # Run the main loop.
  134. while self.run_forever:
  135. log.debug("Found {num_checks} checks".format(num_checks=len(self._checksd['initialized_checks'])))
  136. # Setup profiling if necessary
  137. if self.in_developer_mode and not profiled:
  138. try:
  139. profiler = AgentProfiler()
  140. profiler.enable_profiling()
  141. profiled = True
  142. except Exception as e:
  143. log.warn("Cannot enable profiler: %s" % str(e))
  144. # Do the work.
  145. self.collector.run(checksd=self._checksd,
  146. start_event=self.start_event,
  147. configs_reloaded=self.configs_reloaded)
  148. if self.configs_reloaded:
  149. self.configs_reloaded = False
  150. if profiled:
  151. if collector_profiled_runs >= self.collector_profile_interval:
  152. try:
  153. profiler.disable_profiling()
  154. profiled = False
  155. collector_profiled_runs = 0
  156. except Exception as e:
  157. log.warn("Cannot disable profiler: %s" % str(e))
  158. # Check if we should restart.
  159. if self.autorestart and self._should_restart():
  160. self._do_restart()
  161. # Only plan for next loop if we will continue, otherwise exit quickly.
  162. if self.run_forever:
  163. if watchdog:
  164. watchdog.reset()
  165. if profiled:
  166. collector_profiled_runs += 1
  167. log.debug("Sleeping for {0} seconds".format(self.check_frequency))
  168. time.sleep(self.check_frequency)
  169. # Now clean-up.
  170. try:
  171. CollectorStatus.remove_latest_status()
  172. except Exception:
  173. pass
  174. # Explicitly kill the process, because it might be running as a daemon.
  175. log.info("Exiting. Bye bye.")
  176. sys.exit(0)
  177. def _get_emitters(self):
  178. return [http_emitter]
  179. def _get_watchdog(self, check_freq):
  180. watchdog = None
  181. if self._agentConfig.get("watchdog", True):
  182. watchdog = Watchdog(check_freq * WATCHDOG_MULTIPLIER,
  183. max_mem_mb=self._agentConfig.get('limit_memory_consumption', None))
  184. watchdog.reset()
  185. return watchdog
  186. def _set_agent_config_hostname(self, agentConfig):
  187. # Try to fetch instance Id from EC2 if not hostname has been set
  188. # in the config file.
  189. # DEPRECATED
  190. if agentConfig.get('hostname') is None and agentConfig.get('use_ec2_instance_id'):
  191. instanceId = EC2.get_instance_id(agentConfig)
  192. if instanceId is not None:
  193. log.info("Running on EC2, instanceId: %s" % instanceId)
  194. agentConfig['hostname'] = instanceId
  195. else:
  196. log.info('Not running on EC2, using hostname to identify this server')
  197. return agentConfig
  198. def _should_restart(self):
  199. if time.time() - self.agent_start > self.restart_interval:
  200. return True
  201. return False
  202. def _do_restart(self):
  203. log.info("Running an auto-restart.")
  204. if self.collector:
  205. self.collector.stop()
  206. sys.exit(AgentSupervisor.RESTART_EXIT_STATUS)
  207. def main():
  208. options, args = get_parsed_args()
  209. agentConfig = get_config(options=options)
  210. autorestart = agentConfig.get('autorestart', False)
  211. hostname = get_hostname(agentConfig)
  212. in_developer_mode = agentConfig.get('developer_mode')
  213. COMMANDS_AGENT = [
  214. 'start',
  215. 'stop',
  216. 'restart',
  217. 'status',
  218. 'foreground',
  219. ]
  220. COMMANDS_NO_AGENT = [
  221. 'info',
  222. 'check',
  223. 'configcheck',
  224. 'jmx',
  225. 'flare',
  226. ]
  227. COMMANDS = COMMANDS_AGENT + COMMANDS_NO_AGENT
  228. if len(args) < 1:
  229. sys.stderr.write("Usage: %s %s\n" % (sys.argv[0], "|".join(COMMANDS)))
  230. return 2
  231. command = args[0]
  232. if command not in COMMANDS:
  233. sys.stderr.write("Unknown command: %s\n" % command)
  234. return 3
  235. # Deprecation notice
  236. if command not in DD_AGENT_COMMANDS:
  237. # Will become an error message and exit after deprecation period
  238. from utils.deprecations import deprecate_old_command_line_tools
  239. deprecate_old_command_line_tools()
  240. if command in COMMANDS_AGENT:
  241. agent = Agent(PidFile('dd-agent').get_path(), autorestart, in_developer_mode=in_developer_mode)
  242. if command in START_COMMANDS:
  243. log.info('Agent version %s' % get_version())
  244. if 'start' == command:
  245. log.info('Start daemon')
  246. agent.start()
  247. elif 'stop' == command:
  248. log.info('Stop daemon')
  249. agent.stop()
  250. elif 'restart' == command:
  251. log.info('Restart daemon')
  252. agent.restart()
  253. elif 'status' == command:
  254. agent.status()
  255. elif 'info' == command:
  256. return Agent.info(verbose=options.verbose)
  257. elif 'foreground' == command:
  258. logging.info('Running in foreground')
  259. if autorestart:
  260. # Set-up the supervisor callbacks and fork it.
  261. logging.info('Running Agent with auto-restart ON')
  262. def child_func():
  263. agent.start(foreground=True)
  264. def parent_func():
  265. agent.start_event = False
  266. AgentSupervisor.start(parent_func, child_func)
  267. else:
  268. # Run in the standard foreground.
  269. agent.start(foreground=True)
  270. elif 'check' == command:
  271. if len(args) < 2:
  272. sys.stderr.write(
  273. "Usage: %s check <check_name> [check_rate]\n"
  274. "Add check_rate as last argument to compute rates\n"
  275. % sys.argv[0]
  276. )
  277. return 1
  278. check_name = args[1]
  279. try:
  280. import checks.collector
  281. # Try the old-style check first
  282. print getattr(checks.collector, check_name)(log).check(agentConfig)
  283. except Exception:
  284. # If not an old-style check, try checks.d
  285. checks = load_check_directory(agentConfig, hostname)
  286. for check in checks['initialized_checks']:
  287. if check.name == check_name:
  288. if in_developer_mode:
  289. check.run = AgentProfiler.wrap_profiling(check.run)
  290. cs = Collector.run_single_check(check, verbose=True)
  291. print CollectorStatus.render_check_status(cs)
  292. if len(args) == 3 and args[2] == 'check_rate':
  293. print "Running 2nd iteration to capture rate metrics"
  294. time.sleep(1)
  295. cs = Collector.run_single_check(check, verbose=True)
  296. print CollectorStatus.render_check_status(cs)
  297. check.stop()
  298. elif 'configcheck' == command or 'configtest' == command:
  299. configcheck()
  300. elif 'jmx' == command:
  301. jmx_command(args[1:], agentConfig)
  302. elif 'flare' == command:
  303. Flare.check_user_rights()
  304. case_id = int(args[1]) if len(args) > 1 else None
  305. f = Flare(True, case_id)
  306. f.collect()
  307. try:
  308. f.upload()
  309. except Exception, e:
  310. print 'The upload failed:\n{0}'.format(str(e))
  311. return 0
  312. if __name__ == '__main__':
  313. try:
  314. sys.exit(main())
  315. except StandardError:
  316. # Try our best to log the error.
  317. try:
  318. log.exception("Uncaught error running the Agent")
  319. except Exception:
  320. pass
  321. raise