/mysql_watcher/dblibs/dbmonitor.py

https://bitbucket.org/lindenlab/apiary/ · Python · 496 lines · 463 code · 4 blank · 29 comment · 5 complexity · d37f656c5f7764cdf995c74df75a0b8e MD5 · raw file

  1. #!/usr/bin/env python
  2. #
  3. # $LicenseInfo:firstyear=2007&license=mit$
  4. #
  5. # Copyright (c) 2007-2010, Linden Research, Inc.
  6. #
  7. # Permission is hereby granted, free of charge, to any person obtaining a copy
  8. # of this software and associated documentation files (the "Software"), to deal
  9. # in the Software without restriction, including without limitation the rights
  10. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11. # copies of the Software, and to permit persons to whom the Software is
  12. # furnished to do so, subject to the following conditions:
  13. #
  14. # The above copyright notice and this permission notice shall be included in
  15. # all copies or substantial portions of the Software.
  16. #
  17. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  20. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  23. # THE SOFTWARE.
  24. # $/LicenseInfo$
  25. #
  26. #
  27. # Utility classes that allow us to monitor and keep track of databases
  28. #
  29. import copy
  30. import MySQLdb
  31. from dbutil import *
  32. MAX_SLAVE_BEHIND = 2 # Maximum number of seconds a slave can be behind
  33. MAX_SLAVE_RUNNING = 4 # Maximum number of running processes allowed on the slave
  34. MAX_SLAVE_AVG_AGE = 2 # Maximum average for all running queries
  35. def safe_delete(host, delete_sql, num_rows = 0, verbose = False, stop_time = 0, paranoid = False, batch = True, ignored_slaves = ()):
  36. """Safely do a bulk delete using the delete string on the specified database host
  37. Returns the number of rows deleted, or -1 if there was an error"""
  38. # Validate the incoming statement
  39. if re.compile(".*\s+limit\s+\d+").match(delete_sql.lower()):
  40. print "Query '%s' appears to have a LIMIT clause, aborting" % delete_sql
  41. return -1
  42. dbm = DBMonitor(host)
  43. dbm.mSafeParanoidSlaves = paranoid
  44. dbm.mIgnoredSlaves = ignored_slaves
  45. deleted = 0
  46. MIN_ROWS = 500 # Minimum number of rows to delete at once
  47. MAX_ELAPSED = 15.0 # Elapsed time to target for a single delete
  48. # Dynamically adapt these variables to ensure optimum delete rates
  49. limit_rows = MIN_ROWS
  50. sleep_time = 1.0
  51. while 1:
  52. # Exit or continue based on the stop time
  53. if stop_time and (stop_time < time.time()):
  54. # Always print this
  55. if verbose:
  56. print "Time has run out, %d deletes completed!" % deleted
  57. return ("Timeout", deleted)
  58. safe = dbm.safeToRunQuery(verbose)
  59. if verbose:
  60. print time.ctime()
  61. dbm.dump()
  62. sleep_time = 1.0 # Sleep for 1 second between attempts, by default
  63. # Run the query if safe
  64. if safe:
  65. db = dbm.connect(True)
  66. if db:
  67. cursor = db.cursor()
  68. if batch:
  69. full_query = delete_sql + " LIMIT %d" % limit_rows
  70. else:
  71. full_query = delete_sql
  72. if verbose:
  73. print "Executing %s" % full_query
  74. sys.stdout.flush()
  75. begin_time = time.time()
  76. cursor.execute(full_query)
  77. end_time = time.time()
  78. elapsed = end_time - begin_time
  79. # Exit or continue based on number of rows deleted
  80. if cursor.rowcount < 0:
  81. return ("Error", deleted)
  82. else:
  83. # Increment the number of deleted rows
  84. deleted += cursor.rowcount
  85. if cursor.rowcount < limit_rows:
  86. # We've deleted everything, as otherwise we would have deleted limit_rows
  87. return ("OK", deleted)
  88. if num_rows and (deleted >= num_rows):
  89. # If we've specified a number of rows to try to delete and we've hit it, exit
  90. return ("OK", deleted)
  91. # Adapt the number of rows to delete based on execution time
  92. limit_rows = limit_rows*(MAX_ELAPSED/elapsed)
  93. limit_rows = int((limit_rows//100)*100)
  94. limit_rows = max(MIN_ROWS, limit_rows)
  95. # Clamp us to delete exactly the specified number of rows if we have num_rows
  96. if num_rows:
  97. limit_rows = min(num_rows - deleted, limit_rows)
  98. # Sleep for as long as the delete took before trying again, this should
  99. # give us a reasonable sleep time
  100. sleep_time = min(2*MAX_SLAVE_BEHIND, max(0.25, elapsed))
  101. if verbose:
  102. print time.ctime()
  103. print "Total deleted: %d" % deleted
  104. print "Elapsed time: %s" % elapsed
  105. cursor.close()
  106. if verbose:
  107. print
  108. sys.stdout.flush()
  109. time.sleep(sleep_time)
  110. def log_delete(host, query, end_time, table, paranoid = True, verbose = False, batch = True):
  111. """A convenience function for cleanups inside cron jobs"""
  112. print time.ctime(), "Deleting from %s" % table
  113. (status, count) = safe_delete(host, query, verbose = verbose, stop_time = end_time, paranoid = paranoid, batch = batch)
  114. if "Error" == status:
  115. print >> sys.stderr, time.ctime(), "%s: %s failed" % (table, query)
  116. return True
  117. elif "Timeout" == status:
  118. print >> sys.stderr, time.ctime(), "%s: Timeout, %d rows deleted" % (table, count)
  119. return True
  120. else:
  121. print time.ctime(), "%s: %d rows deleted" % (table, count)
  122. return False
  123. class DBMonitor:
  124. """Monitor a DB and all of its slaves for health"""
  125. def __init__(self, host, user, password):
  126. # Generic database information
  127. self.mHost = host
  128. self.mUser = user
  129. self.mPassword = password
  130. self.mAlive = False
  131. self.mNumProcesses = 0
  132. self.mNumRunning = 0
  133. self.mNumSleeping = 0
  134. self.mAvgAge = 0.0
  135. self.mIsSlave = False
  136. self.mDB = None
  137. self.mProcessRows = None # Contains the results of the last process row queries
  138. # Master specific info
  139. self.mSlaves = None # A list of all slaves
  140. # Slave specific info
  141. self.mMaster = ""
  142. self.mSlaveRunning = False
  143. self.mSlaveBehind = 0
  144. self.mIgnoredSlaves = () # A list of slaves to ignore
  145. # Database "health" options
  146. self.mSafeParanoidSlaves = True # All slaves have to be healthy
  147. self.mSafeMaxProcesses = 5 # Max processes to allow to run on this host when safe
  148. self.mSafeMaxAvgAge = 300 # If multiple queries, maximum allowed average age
  149. def connect(self, force_reconnect = False):
  150. """Connect to the database - flush the existing connection if we're already connected."""
  151. if self.mDB and force_reconnect:
  152. self.mDB.close()
  153. self.mDB = None
  154. if not self.mDB:
  155. try:
  156. self.mDB = MySQLdb.connect(self.mHost,
  157. self.mUser,
  158. self.mPassword)
  159. except:
  160. print "Failed to connect to database on %s" % self.mHost
  161. self.mDB = None
  162. self.mAlive = False
  163. return None
  164. self.mDB.autocommit(True)
  165. self.mAlive = True
  166. return self.mDB
  167. def disconnect(self):
  168. self.mDB.close()
  169. self.mDB = None
  170. def killProcess(self, pid):
  171. self.connect(force_reconnect=False)
  172. if self.mDB:
  173. cursor = self.mDB.cursor()
  174. try:
  175. cursor.execute("kill %d" % pid)
  176. except:
  177. pass
  178. cursor.close()
  179. else:
  180. print "Couldn't get database"
  181. def getProcessList(self):
  182. """Return all of the running processes on the DB in a sequence of row maps."""
  183. # Flush existing statistics
  184. self.mNumProcesses = 0
  185. self.mNumRunning = 0
  186. self.mNumSleeping = 0
  187. # Connect to the DB
  188. self.connect()
  189. if not self.isAlive():
  190. return ()
  191. cursor = self.mDB.cursor()
  192. cursor.execute("show full processlist")
  193. all = all_as_maps(cursor)
  194. cursor.close()
  195. # Generate statistics
  196. self.mNumProcesses = len(all)
  197. running_ages = 0.0
  198. self.mProcessRows = all
  199. # Accumulate statistics for all processes
  200. for row in all:
  201. # Ignore backups
  202. if (row['Command'] == 'Query') and (row['User'] != 'backup'):
  203. self.mNumRunning += 1
  204. running_ages += row['Time']
  205. elif row['Command'] == 'Sleep':
  206. self.mNumSleeping += 1
  207. self.mAvgAge = running_ages/self.mNumRunning
  208. return all
  209. def explain(self, database, query):
  210. self.connect()
  211. cursor = self.mDB.cursor()
  212. if database:
  213. cursor.execute("use %s" % database)
  214. query = query.replace('\n',' ')
  215. query = re.sub('\s+', ' ', query)
  216. explain_str = "explain %s" % query
  217. out = {}
  218. try:
  219. cursor.execute(explain_str)
  220. except:
  221. #print "Exception in explain for db: %s query:%s" % (database, query)
  222. return None
  223. all = all_as_maps(cursor)
  224. # Reorganize join info by query
  225. out['explain_str'] = explain_str
  226. out['tables'] = {}
  227. explain_tables = out['tables']
  228. out['analysis'] = {}
  229. out['raw_explain'] = copy.deepcopy(all)
  230. for row in all:
  231. table = row['table']
  232. if not table:
  233. table = ''
  234. #if table in explain_tables:
  235. # continue
  236. explain_tables[table] = copy.deepcopy(row)
  237. for key in row.keys():
  238. if None == key:
  239. raise "Bad row:", row
  240. if not row['possible_keys']:
  241. if not 'no_key' in out['analysis']:
  242. out['analysis']['no_key'] = []
  243. out['analysis']['no_key'].append(table)
  244. return out
  245. def safeToRunQuery(self, verbose = False):
  246. """Return if it's safe to run an expensive query"""
  247. # Get updates on the master
  248. self.getProcessList()
  249. # Get updates on all of the slaves
  250. if not self.mSlaves:
  251. self.getSlaves()
  252. self.updateSlaves()
  253. # Now, check everything we care about
  254. MAX_PROCESSES = 5 # Maximum number of running processes on the master
  255. safe = True
  256. if self.mNumRunning > self.mSafeMaxProcesses:
  257. safe = False
  258. if verbose:
  259. print "%s: Unsafe: %d running processes" % (self.mHost, self.mNumRunning)
  260. if self.mAvgAge > self.mSafeMaxAvgAge:
  261. safe = False
  262. if verbose:
  263. print "%s: Unsafe: %f average query age" % (self.mHost, self.mAvgAge)
  264. num_slaves = self.mSlaves
  265. # Check if slaves are OK, but only if we have some
  266. if num_slaves:
  267. healthy_slaves = 0
  268. for slave in self.mSlaves.values():
  269. if slave.isSlaveHealthy(verbose):
  270. healthy_slaves += 1
  271. elif slave.mHost in self.mIgnoredSlaves:
  272. # Pretend the slave is healthy
  273. healthy_slaves += 1
  274. if verbose:
  275. print "Ignoring unhealthy slave %s" % slave.mHost
  276. if ((self.mSafeParanoidSlaves and (healthy_slaves != len(self.mSlaves)))
  277. or (not healthy_slaves)):
  278. safe = False
  279. if verbose:
  280. print "Not enough healthy slaves (%d/%d)" % (healthy_slaves, len(self.mSlaves))
  281. if not safe and verbose:
  282. print "%s: Unsafe to run query!" % self.mHost
  283. elif verbose:
  284. print "%s: Safe to run query" % self.mHost
  285. return safe
  286. def masterStatusString(self):
  287. return "%s: Master: (%d/%d) AvgAge %.2f" % (self.mHost, self.mNumRunning, self.mNumProcesses, self.mAvgAge)
  288. def dump(self):
  289. """Dump useful and interesting information about this database and its slaves"""
  290. print self.masterStatusString()
  291. for slave in self.mSlaves.values():
  292. slave.dumpSlave()
  293. def isAlive(self):
  294. return self.mAlive
  295. #
  296. # Get all table information for a database
  297. #
  298. def getTableInfo(self):
  299. # Connect to the DB
  300. self.connect()
  301. if not self.isAlive():
  302. return ()
  303. cursor = self.mDB.cursor()
  304. cursor.execute("show databases")
  305. all_dbs = all_as_maps(cursor)
  306. dbs = {}
  307. # Get all the databases
  308. for db in all_dbs:
  309. dbs[db['Database']] = {}
  310. bad_tables = ['tmp', 'information_schema', 'mysql','secondopinion']
  311. # Iterate through all the databases and get table information.
  312. for db in dbs.keys():
  313. dbs[db]['tables'] = {}
  314. if db in bad_tables:
  315. continue
  316. print "Database:", db
  317. print "----------------------"
  318. cursor.execute("use %s" % db)
  319. cursor.execute("show table status")
  320. all_tables = all_as_maps(cursor)
  321. print "Name Rows Data Index"
  322. print "---------------------"
  323. for table in all_tables:
  324. dbs[db]['tables'][table['Name']] = copy.deepcopy(table)
  325. print table['Name'],table['Rows'], table['Data_length'], table['Index_length']
  326. print
  327. return dbs
  328. #
  329. # Get all tables and their fields for the sequence and indra DBs
  330. # as a nested dict
  331. def getTableFields(self):
  332. # Connect to the DB
  333. self.connect()
  334. if not self.isAlive():
  335. return ()
  336. cursor = self.mDB.cursor()
  337. dbs = {'sequence':{},'indra':{}}
  338. # Iterate through all the databases and get table information.
  339. for db in dbs.keys():
  340. cursor.execute("use %s" % db)
  341. cursor.execute("show table status")
  342. all_tables = all_as_maps(cursor)
  343. for table in all_tables:
  344. cursor.execute("desc %s" % table['Name'])
  345. dbs[db][table['Name']] = [row[0] for row in cursor.fetchall()]
  346. return dbs
  347. #
  348. # Slave management methods for master
  349. #
  350. def getSlaves(self):
  351. """Get a list of all of the slaves of this mysql host"""
  352. procs = self.getProcessList()
  353. self.clearSlaves()
  354. for row in procs:
  355. if row['Command'] == 'Binlog Dump':
  356. host, port = row['Host'].split(':')
  357. hostname = socket.gethostbyaddr(host)[0]
  358. slave = DBMonitor(hostname, self.mUser, self.mPassword)
  359. slave.mIsSlave = True
  360. self.mSlaves[hostname] = slave
  361. #print self.mSlaves
  362. def clearSlaves(self):
  363. """Cleanup all slave DBs"""
  364. self.mSlaves = {}
  365. def updateSlaves(self):
  366. """Update the status of all of the slave DBs"""
  367. for slave in self.mSlaves.values():
  368. slave.updateSlaveStatus()
  369. pass
  370. #
  371. # Slave management methods for slave
  372. #
  373. def updateSlaveStatus(self):
  374. """Get information about slave status"""
  375. # Flush existing data
  376. self.mMaster = ""
  377. self.mSlaveRunning = False
  378. self.mSlaveBehind = 0
  379. # Connect to the database
  380. self.connect()
  381. if not self.isAlive():
  382. return
  383. cursor = self.mDB.cursor()
  384. cursor.execute("show slave status")
  385. all = all_as_maps(cursor)
  386. cursor.close()
  387. # Pull data from result
  388. row = all[0]
  389. self.mMaster = row['Master_Host']
  390. self.mSlaveRunning = ('Yes' == row['Slave_SQL_Running']) and ('Yes' == row['Slave_IO_Running'])
  391. self.mSlaveBehind = row['Seconds_Behind_Master']
  392. if None == self.mSlaveBehind:
  393. self.mSlaveBehind = 99999
  394. # Update process list stats
  395. self.getProcessList()
  396. def isSlaveHealthy(self, verbose = False):
  397. healthy = True
  398. if not self.isAlive():
  399. healthy = False
  400. if verbose:
  401. print "%s: Unable to connect to database!" % self.mHost
  402. if not self.mSlaveRunning:
  403. healthy = False
  404. if verbose:
  405. print "%s: Slave is not running!" % self.mHost
  406. if self.mSlaveBehind > MAX_SLAVE_BEHIND:
  407. healthy = False
  408. if verbose:
  409. print "%s: Slave is %d seconds behind" % (self.mHost, self.mSlaveBehind)
  410. if self.mNumRunning > MAX_SLAVE_RUNNING:
  411. healthy = False
  412. if verbose:
  413. print "%s: Slave has %d running processes" % (self.mHost, self.mNumRunning)
  414. #if self.mAvgAge > MAX_SLAVE_AVG_AGE:
  415. # healthy = False
  416. # if verbose:
  417. # print "%s: Slave has %f average age" % (self.mHost, self.mAvgAge)
  418. return healthy
  419. def slaveStatusString(self):
  420. return "%s: Slave: Run: %s\tBehind: %d\tRun Procs: %d\tAvgAge: %.2f" \
  421. % (self.mHost, str(self.mSlaveRunning), self.mSlaveBehind, self.mNumRunning, self.mAvgAge)
  422. def dumpSlave(self):
  423. """Dump slave-specific info"""
  424. print self.slaveStatusString()
  425. def main():
  426. """Simple test stub which dumps how happy a particular database is."""
  427. db_host = sys.argv[1]
  428. dbm = DBMonitor(db_host)
  429. while 1:
  430. dbm.safeToRunQuery(True)
  431. dbm.dump()
  432. print
  433. time.sleep(1.0)
  434. if __name__ == "__main__":
  435. main()