PageRenderTime 42ms CodeModel.GetById 18ms RepoModel.GetById 1ms app.codeStats 0ms

/r2/supervise_watcher.py

https://github.com/wangmxf/lesswrong
Python | 310 lines | 254 code | 31 blank | 25 comment | 33 complexity | 1e2e34e268b15624ef8ad58de057b95f MD5 | raw file
Possible License(s): MPL-2.0-no-copyleft-exception, LGPL-2.1
  1. # The contents of this file are subject to the Common Public Attribution
  2. # License Version 1.0. (the "License"); you may not use this file except in
  3. # compliance with the License. You may obtain a copy of the License at
  4. # http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
  5. # License Version 1.1, but Sections 14 and 15 have been added to cover use of
  6. # software over a computer network and provide for limited attribution for the
  7. # Original Developer. In addition, Exhibit A has been modified to be consistent
  8. # with Exhibit B.
  9. #
  10. # Software distributed under the License is distributed on an "AS IS" basis,
  11. # WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
  12. # the specific language governing rights and limitations under the License.
  13. #
  14. # The Original Code is Reddit.
  15. #
  16. # The Original Developer is the Initial Developer. The Initial Developer of the
  17. # Original Code is CondeNet, Inc.
  18. #
  19. # All portions of the code written by CondeNet are Copyright (c) 2006-2008
  20. # CondeNet, Inc. All Rights Reserved.
  21. ################################################################################
  22. #!/usr/bin/env python
  23. from pylons import g
  24. import os, re, sys
  25. from datetime import datetime, timedelta
  26. cache = g.cache
  27. host = g.reddit_host
  28. default_services = ['newreddit']
  29. default_servers = g.monitored_servers
  30. class Service:
  31. maxlen = 300
  32. __slots__ = ['host', 'name', 'pid', 'load']
  33. def __init__(self, name, pid, age, time):
  34. self.host = host
  35. self.name = name
  36. self.pid = pid
  37. self.age = age
  38. self.time = time
  39. self._cpu = []
  40. self.load = 0
  41. self.mem = 0
  42. self.age = 0
  43. def __iter__(self):
  44. for x in self.__slots__:
  45. yield (x, getattr(self, x))
  46. yield ('last_seen', (datetime.now() - self.time).seconds)
  47. for t in (0, 5, 60, 300):
  48. yield ('cpu_%d' % t, self.cpu(t))
  49. yield ('mem', self.mem)
  50. yield ('age', self.age)
  51. def __str__(self):
  52. return ("%(host)s\t%(cpu_0)5.2f%%\t%(cpu_5)5.2f%%\t%(cpu_60)5.2f%%\t%(cpu_300)5.2f%%" +
  53. "\t%(pid)s\t%(name)s\t(%(last_seen)s seconds)") % dict(self)
  54. def track_cpu(self, usage):
  55. self.time = datetime.now()
  56. self._cpu.append((usage, self.time))
  57. if len(self._cpu) > self.maxlen:
  58. self._cpu = self._cpu[-self.maxlen:]
  59. def track_mem(self, usage):
  60. self.mem = usage
  61. def track_age(self, usage):
  62. self.age = usage
  63. def cpu(self, interval = 60):
  64. time = datetime.now()
  65. if interval > 0:
  66. cpu = filter(lambda x: time - x[1] <= timedelta(0, interval), self._cpu)
  67. elif self._cpu:
  68. cpu = [self._cpu[-1]]
  69. else:
  70. cpu = []
  71. return sum(c[0] for c in cpu)/max(len(cpu), 1)
  72. class Services:
  73. cache_key = "supervise_services_"
  74. def __init__(self, _host = host):
  75. self.last_update = None
  76. self._services = {}
  77. self._host = _host
  78. self.load = 0.
  79. def track(self, pid, cpu, mem, age):
  80. try:
  81. if isinstance(pid, str):
  82. pid = int(pid)
  83. if self._services.has_key(pid):
  84. self._services[pid].track_cpu(cpu)
  85. self._services[pid].track_mem(mem)
  86. self._services[pid].track_age(age)
  87. except ValueError:
  88. pass
  89. def add(self, name, pid, age):
  90. self.last_update = datetime.now()
  91. if not self._services.has_key(pid):
  92. self._services[pid] = Service(name, pid, age, self.last_update)
  93. else:
  94. self._services[pid].time = self.last_update
  95. self._services[pid].age = age
  96. def __iter__(self):
  97. return self._services.itervalues()
  98. def get_cache(self):
  99. key = self.cache_key + str(self._host)
  100. res = cache.get(key)
  101. if isinstance(res, dict):
  102. services = res.get("services", [])
  103. self.load = res.get("load", 0)
  104. else:
  105. services = res
  106. self.load = services[0].get("load", 0) if services else 0
  107. return services
  108. def set_cache(self):
  109. key = self.cache_key + str(self._host)
  110. svs = [dict(s) for s in self]
  111. cache.set(key, dict(load = self.load,
  112. services = svs,
  113. host = self._host))
  114. def clean_dead(self, age = 30):
  115. time = datetime.now()
  116. active = filter(lambda s: time - self._services[s].time <= timedelta(0, age),
  117. self._services.keys())
  118. existing = self._services.keys()
  119. for pid in existing:
  120. if pid not in active:
  121. del self._services[pid]
  122. from r2.config.templates import tpm
  123. from r2.lib.wrapped import Wrapped
  124. tpm.add('service_page', 'html', file = "server_status_page.html")
  125. tpm.add('service_page', 'htmllite', file = "server_status_page.htmllite")
  126. class Service_Page(Wrapped):
  127. def __init__(self, machines = default_servers):
  128. self.services = [Services(m) for m in machines]
  129. def __repr__(self):
  130. return "service page"
  131. def Alert(restart_list=['MEM','CPU']):
  132. import time
  133. import smtplib
  134. import re
  135. p=re.compile("/service/newreddit(\d+)\:")
  136. cache_key = 'already_alerted_'
  137. alert_recipients = ['nerds@reddit.com']
  138. alert_sender = 'nerds@reddit.com'
  139. smtpserver = 'nt03.wireddit.com'
  140. for m in default_servers:
  141. s = Services(m)
  142. services = s.get_cache() or []
  143. services.sort(lambda x, y: 1 if x['name'] > y['name'] else -1)
  144. for service in services:
  145. output = "\nCPU: "
  146. #output += (str(service['host']) + " " + str(service['name']))
  147. pegged_count = 0
  148. need_restart = False
  149. # Check for pegged procs
  150. for x in (0, 5, 60, 300):
  151. val = service['cpu_' + str(x)]
  152. if val > 99:
  153. pegged_count += 1
  154. output += " %6.2f%%" % val
  155. service_name = str(service['host']) + " " + str(service['name'])
  156. if (pegged_count > 3):
  157. if 'CPU' in restart_list:
  158. need_restart = True
  159. # Check for out of memory situation
  160. output += "\nMEMORY: %6.2f%%" % service.get('mem', 0)
  161. mem_pegged = (service.get('mem', 0) > 10)
  162. if (mem_pegged):
  163. if 'MEM' in restart_list:
  164. need_restart = True
  165. if (need_restart):
  166. mesg = ("To: nerds@gmail.com\n" +
  167. "Subject: " + service_name.replace("/service/","")
  168. +" needs attention\n\n"
  169. + service_name.replace("/service/","")
  170. + (" is out of mem: " if mem_pegged else " is pegged:" )
  171. + output)
  172. m = p.match(service['name'])
  173. # If we can restart this process, we do it here
  174. if m:
  175. proc_number = str(m.groups()[0])
  176. cmd = "/usr/local/bin/push -h " + \
  177. service['host'] + " -r " + proc_number
  178. result = ""
  179. result = os.popen3(cmd)[2].read()
  180. # We override the other message to show we restarted it
  181. mesg = "To: nerds@gmail.com\n" + "Subject: " + "Process " + \
  182. proc_number + " on " + service['host'] + \
  183. " was automatically restarted due to the following:\n\n" + \
  184. output + "\n\n" + \
  185. "Here was the output:\n" + result
  186. # Uncomment this to disable restart messages
  187. #mesg = ""
  188. last_alerted = cache.get(cache_key + service_name) or 0
  189. #last_alerted = 0
  190. if (time.time() - last_alerted < 300):
  191. pass
  192. else:
  193. cache.set(cache_key + service_name, time.time())
  194. if mesg is not "":
  195. session = smtplib.SMTP(smtpserver)
  196. smtpresult = session.sendmail(alert_sender,
  197. alert_recipients, mesg)
  198. session.quit()
  199. #print mesg
  200. #print "Email sent"
  201. def Write(file = None, servers = default_servers):
  202. if file:
  203. handle = open(file, "w")
  204. else:
  205. handle = sys.stdout
  206. handle.write(Service_Page(servers).render())
  207. if file:
  208. handle.close()
  209. def Run(srvname=None, loop = True, loop_time = 2):
  210. services = Services()
  211. pidi = 0
  212. cpuid = 8
  213. memid = 9
  214. ageid = 10
  215. procid = 11
  216. text = re.compile('\S+')
  217. from time import sleep
  218. counter = 0
  219. while True:
  220. # reload the processes
  221. if counter % 10 == 0:
  222. handle = os.popen("/usr/local/bin/svstat /service/*")
  223. for line in handle:
  224. try:
  225. name, status, blah, pid, time, label = line.split(' ')
  226. pid = int(pid.strip(')'))
  227. if not srvname or any(s in name for s in srvname):
  228. services.add(name, pid, time)
  229. except ValueError:
  230. pass
  231. services.clean_dead()
  232. handle.close()
  233. counter +=1
  234. cmd = ('/usr/bin/top -b -n 1 ' +
  235. ' '.join("-p%d"%x.pid for x in services))
  236. handle = os.popen(cmd)
  237. for line in handle:
  238. line = text.findall(line)
  239. try:
  240. services.track(line[pidi], float(line[cpuid]),
  241. float(line[memid]),
  242. float(line[ageid].split(':')[0]))
  243. except (ValueError, IndexError):
  244. pass
  245. handle.close()
  246. handle = os.popen('/usr/bin/uptime')
  247. foo = handle.read()
  248. services.load=float(foo.split("average:")[1].strip(' ').split(',')[0])
  249. handle.close()
  250. res = ''
  251. services.set_cache()
  252. if loop:
  253. sleep(loop_time)
  254. else:
  255. break
  256. def Test(num, load = 1., pid = 0):
  257. services = Services()
  258. for i in xrange(num):
  259. name = 'testproc' + str(i)
  260. p = i or pid
  261. services.add(name, p, "10")
  262. services.track(p, 100. * (i+1) / (num),
  263. 20. * (i+1) / num, 1.)
  264. services.load = load
  265. services.set_cache()
  266. if __name__ == '__main__':
  267. Run(sys.argv[1:] if sys.argv[1:] else default_services)