/mysql_watcher/dblibs/dbmonitor.py
Python | 496 lines | 428 code | 20 blank | 48 comment | 19 complexity | d37f656c5f7764cdf995c74df75a0b8e MD5 | raw file
1#!/usr/bin/env python 2# 3# $LicenseInfo:firstyear=2007&license=mit$ 4# 5# Copyright (c) 2007-2010, Linden Research, Inc. 6# 7# Permission is hereby granted, free of charge, to any person obtaining a copy 8# of this software and associated documentation files (the "Software"), to deal 9# in the Software without restriction, including without limitation the rights 10# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11# copies of the Software, and to permit persons to whom the Software is 12# furnished to do so, subject to the following conditions: 13# 14# The above copyright notice and this permission notice shall be included in 15# all copies or substantial portions of the Software. 16# 17# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 23# THE SOFTWARE. 24# $/LicenseInfo$ 25# 26 27# 28# Utility classes that allow us to monitor and keep track of databases 29# 30import copy 31import MySQLdb 32from dbutil import * 33 34MAX_SLAVE_BEHIND = 2 # Maximum number of seconds a slave can be behind 35MAX_SLAVE_RUNNING = 4 # Maximum number of running processes allowed on the slave 36MAX_SLAVE_AVG_AGE = 2 # Maximum average for all running queries 37 38 39def safe_delete(host, delete_sql, num_rows = 0, verbose = False, stop_time = 0, paranoid = False, batch = True, ignored_slaves = ()): 40 """Safely do a bulk delete using the delete string on the specified database host 41 Returns the number of rows deleted, or -1 if there was an error""" 42 43 # Validate the incoming statement 44 if re.compile(".*\s+limit\s+\d+").match(delete_sql.lower()): 45 print "Query '%s' appears to have a LIMIT clause, aborting" % delete_sql 46 return -1 47 48 dbm = DBMonitor(host) 49 dbm.mSafeParanoidSlaves = paranoid 50 dbm.mIgnoredSlaves = ignored_slaves 51 deleted = 0 52 53 MIN_ROWS = 500 # Minimum number of rows to delete at once 54 MAX_ELAPSED = 15.0 # Elapsed time to target for a single delete 55 56 # Dynamically adapt these variables to ensure optimum delete rates 57 limit_rows = MIN_ROWS 58 sleep_time = 1.0 59 while 1: 60 # Exit or continue based on the stop time 61 if stop_time and (stop_time < time.time()): 62 # Always print this 63 if verbose: 64 print "Time has run out, %d deletes completed!" % deleted 65 return ("Timeout", deleted) 66 67 safe = dbm.safeToRunQuery(verbose) 68 if verbose: 69 print time.ctime() 70 dbm.dump() 71 sleep_time = 1.0 # Sleep for 1 second between attempts, by default 72 73 # Run the query if safe 74 if safe: 75 db = dbm.connect(True) 76 if db: 77 cursor = db.cursor() 78 if batch: 79 full_query = delete_sql + " LIMIT %d" % limit_rows 80 else: 81 full_query = delete_sql 82 if verbose: 83 print "Executing %s" % full_query 84 sys.stdout.flush() 85 begin_time = time.time() 86 cursor.execute(full_query) 87 end_time = time.time() 88 elapsed = end_time - begin_time 89 90 # Exit or continue based on number of rows deleted 91 if cursor.rowcount < 0: 92 return ("Error", deleted) 93 else: 94 # Increment the number of deleted rows 95 deleted += cursor.rowcount 96 97 if cursor.rowcount < limit_rows: 98 # We've deleted everything, as otherwise we would have deleted limit_rows 99 return ("OK", deleted) 100 101 if num_rows and (deleted >= num_rows): 102 # If we've specified a number of rows to try to delete and we've hit it, exit 103 return ("OK", deleted) 104 105 # Adapt the number of rows to delete based on execution time 106 limit_rows = limit_rows*(MAX_ELAPSED/elapsed) 107 limit_rows = int((limit_rows//100)*100) 108 limit_rows = max(MIN_ROWS, limit_rows) 109 110 # Clamp us to delete exactly the specified number of rows if we have num_rows 111 if num_rows: 112 limit_rows = min(num_rows - deleted, limit_rows) 113 114 # Sleep for as long as the delete took before trying again, this should 115 # give us a reasonable sleep time 116 sleep_time = min(2*MAX_SLAVE_BEHIND, max(0.25, elapsed)) 117 if verbose: 118 print time.ctime() 119 print "Total deleted: %d" % deleted 120 print "Elapsed time: %s" % elapsed 121 cursor.close() 122 if verbose: 123 print 124 sys.stdout.flush() 125 time.sleep(sleep_time) 126 127def log_delete(host, query, end_time, table, paranoid = True, verbose = False, batch = True): 128 """A convenience function for cleanups inside cron jobs""" 129 print time.ctime(), "Deleting from %s" % table 130 (status, count) = safe_delete(host, query, verbose = verbose, stop_time = end_time, paranoid = paranoid, batch = batch) 131 if "Error" == status: 132 print >> sys.stderr, time.ctime(), "%s: %s failed" % (table, query) 133 return True 134 elif "Timeout" == status: 135 print >> sys.stderr, time.ctime(), "%s: Timeout, %d rows deleted" % (table, count) 136 return True 137 else: 138 print time.ctime(), "%s: %d rows deleted" % (table, count) 139 return False 140 141class DBMonitor: 142 """Monitor a DB and all of its slaves for health""" 143 def __init__(self, host, user, password): 144 # Generic database information 145 self.mHost = host 146 self.mUser = user 147 self.mPassword = password 148 self.mAlive = False 149 self.mNumProcesses = 0 150 self.mNumRunning = 0 151 self.mNumSleeping = 0 152 self.mAvgAge = 0.0 153 self.mIsSlave = False 154 self.mDB = None 155 self.mProcessRows = None # Contains the results of the last process row queries 156 157 # Master specific info 158 self.mSlaves = None # A list of all slaves 159 160 # Slave specific info 161 self.mMaster = "" 162 self.mSlaveRunning = False 163 self.mSlaveBehind = 0 164 165 self.mIgnoredSlaves = () # A list of slaves to ignore 166 167 # Database "health" options 168 self.mSafeParanoidSlaves = True # All slaves have to be healthy 169 self.mSafeMaxProcesses = 5 # Max processes to allow to run on this host when safe 170 self.mSafeMaxAvgAge = 300 # If multiple queries, maximum allowed average age 171 172 def connect(self, force_reconnect = False): 173 """Connect to the database - flush the existing connection if we're already connected.""" 174 if self.mDB and force_reconnect: 175 self.mDB.close() 176 self.mDB = None 177 178 if not self.mDB: 179 try: 180 self.mDB = MySQLdb.connect(self.mHost, 181 self.mUser, 182 self.mPassword) 183 except: 184 print "Failed to connect to database on %s" % self.mHost 185 self.mDB = None 186 self.mAlive = False 187 return None 188 189 self.mDB.autocommit(True) 190 191 self.mAlive = True 192 return self.mDB 193 def disconnect(self): 194 self.mDB.close() 195 self.mDB = None 196 def killProcess(self, pid): 197 self.connect(force_reconnect=False) 198 if self.mDB: 199 cursor = self.mDB.cursor() 200 try: 201 cursor.execute("kill %d" % pid) 202 except: 203 pass 204 cursor.close() 205 else: 206 print "Couldn't get database" 207 208 def getProcessList(self): 209 """Return all of the running processes on the DB in a sequence of row maps.""" 210 211 # Flush existing statistics 212 self.mNumProcesses = 0 213 self.mNumRunning = 0 214 self.mNumSleeping = 0 215 216 # Connect to the DB 217 self.connect() 218 if not self.isAlive(): 219 return () 220 cursor = self.mDB.cursor() 221 cursor.execute("show full processlist") 222 all = all_as_maps(cursor) 223 cursor.close() 224 225 # Generate statistics 226 self.mNumProcesses = len(all) 227 running_ages = 0.0 228 229 self.mProcessRows = all 230 231 # Accumulate statistics for all processes 232 for row in all: 233 # Ignore backups 234 if (row['Command'] == 'Query') and (row['User'] != 'backup'): 235 self.mNumRunning += 1 236 running_ages += row['Time'] 237 elif row['Command'] == 'Sleep': 238 self.mNumSleeping += 1 239 self.mAvgAge = running_ages/self.mNumRunning 240 return all 241 242 def explain(self, database, query): 243 self.connect() 244 cursor = self.mDB.cursor() 245 if database: 246 cursor.execute("use %s" % database) 247 query = query.replace('\n',' ') 248 query = re.sub('\s+', ' ', query) 249 explain_str = "explain %s" % query 250 251 out = {} 252 try: 253 cursor.execute(explain_str) 254 except: 255 #print "Exception in explain for db: %s query:%s" % (database, query) 256 return None 257 all = all_as_maps(cursor) 258 # Reorganize join info by query 259 out['explain_str'] = explain_str 260 out['tables'] = {} 261 explain_tables = out['tables'] 262 out['analysis'] = {} 263 out['raw_explain'] = copy.deepcopy(all) 264 for row in all: 265 table = row['table'] 266 if not table: 267 table = '' 268 #if table in explain_tables: 269 # continue 270 explain_tables[table] = copy.deepcopy(row) 271 for key in row.keys(): 272 if None == key: 273 raise "Bad row:", row 274 if not row['possible_keys']: 275 if not 'no_key' in out['analysis']: 276 out['analysis']['no_key'] = [] 277 out['analysis']['no_key'].append(table) 278 return out 279 280 def safeToRunQuery(self, verbose = False): 281 """Return if it's safe to run an expensive query""" 282 # Get updates on the master 283 self.getProcessList() 284 285 # Get updates on all of the slaves 286 if not self.mSlaves: 287 self.getSlaves() 288 self.updateSlaves() 289 290 # Now, check everything we care about 291 MAX_PROCESSES = 5 # Maximum number of running processes on the master 292 293 safe = True 294 if self.mNumRunning > self.mSafeMaxProcesses: 295 safe = False 296 if verbose: 297 print "%s: Unsafe: %d running processes" % (self.mHost, self.mNumRunning) 298 if self.mAvgAge > self.mSafeMaxAvgAge: 299 safe = False 300 if verbose: 301 print "%s: Unsafe: %f average query age" % (self.mHost, self.mAvgAge) 302 303 num_slaves = self.mSlaves 304 # Check if slaves are OK, but only if we have some 305 if num_slaves: 306 healthy_slaves = 0 307 for slave in self.mSlaves.values(): 308 if slave.isSlaveHealthy(verbose): 309 healthy_slaves += 1 310 elif slave.mHost in self.mIgnoredSlaves: 311 # Pretend the slave is healthy 312 healthy_slaves += 1 313 if verbose: 314 print "Ignoring unhealthy slave %s" % slave.mHost 315 316 317 if ((self.mSafeParanoidSlaves and (healthy_slaves != len(self.mSlaves))) 318 or (not healthy_slaves)): 319 safe = False 320 if verbose: 321 print "Not enough healthy slaves (%d/%d)" % (healthy_slaves, len(self.mSlaves)) 322 323 if not safe and verbose: 324 print "%s: Unsafe to run query!" % self.mHost 325 elif verbose: 326 print "%s: Safe to run query" % self.mHost 327 return safe 328 329 def masterStatusString(self): 330 return "%s: Master: (%d/%d) AvgAge %.2f" % (self.mHost, self.mNumRunning, self.mNumProcesses, self.mAvgAge) 331 332 def dump(self): 333 """Dump useful and interesting information about this database and its slaves""" 334 print self.masterStatusString() 335 for slave in self.mSlaves.values(): 336 slave.dumpSlave() 337 338 def isAlive(self): 339 return self.mAlive 340 341 # 342 # Get all table information for a database 343 # 344 def getTableInfo(self): 345 # Connect to the DB 346 self.connect() 347 if not self.isAlive(): 348 return () 349 cursor = self.mDB.cursor() 350 cursor.execute("show databases") 351 all_dbs = all_as_maps(cursor) 352 dbs = {} 353 # Get all the databases 354 for db in all_dbs: 355 dbs[db['Database']] = {} 356 357 bad_tables = ['tmp', 'information_schema', 'mysql','secondopinion'] 358 # Iterate through all the databases and get table information. 359 for db in dbs.keys(): 360 dbs[db]['tables'] = {} 361 if db in bad_tables: 362 continue 363 print "Database:", db 364 print "----------------------" 365 cursor.execute("use %s" % db) 366 cursor.execute("show table status") 367 all_tables = all_as_maps(cursor) 368 print "Name Rows Data Index" 369 print "---------------------" 370 for table in all_tables: 371 dbs[db]['tables'][table['Name']] = copy.deepcopy(table) 372 print table['Name'],table['Rows'], table['Data_length'], table['Index_length'] 373 print 374 return dbs 375 376 # 377 # Get all tables and their fields for the sequence and indra DBs 378 # as a nested dict 379 def getTableFields(self): 380 # Connect to the DB 381 self.connect() 382 if not self.isAlive(): 383 return () 384 cursor = self.mDB.cursor() 385 dbs = {'sequence':{},'indra':{}} 386 # Iterate through all the databases and get table information. 387 for db in dbs.keys(): 388 cursor.execute("use %s" % db) 389 cursor.execute("show table status") 390 all_tables = all_as_maps(cursor) 391 for table in all_tables: 392 cursor.execute("desc %s" % table['Name']) 393 dbs[db][table['Name']] = [row[0] for row in cursor.fetchall()] 394 return dbs 395 396 # 397 # Slave management methods for master 398 # 399 def getSlaves(self): 400 """Get a list of all of the slaves of this mysql host""" 401 procs = self.getProcessList() 402 self.clearSlaves() 403 for row in procs: 404 if row['Command'] == 'Binlog Dump': 405 host, port = row['Host'].split(':') 406 hostname = socket.gethostbyaddr(host)[0] 407 slave = DBMonitor(hostname, self.mUser, self.mPassword) 408 slave.mIsSlave = True 409 self.mSlaves[hostname] = slave 410 #print self.mSlaves 411 412 def clearSlaves(self): 413 """Cleanup all slave DBs""" 414 self.mSlaves = {} 415 416 def updateSlaves(self): 417 """Update the status of all of the slave DBs""" 418 for slave in self.mSlaves.values(): 419 slave.updateSlaveStatus() 420 pass 421 422 # 423 # Slave management methods for slave 424 # 425 def updateSlaveStatus(self): 426 """Get information about slave status""" 427 428 # Flush existing data 429 self.mMaster = "" 430 self.mSlaveRunning = False 431 self.mSlaveBehind = 0 432 433 # Connect to the database 434 self.connect() 435 if not self.isAlive(): 436 return 437 cursor = self.mDB.cursor() 438 cursor.execute("show slave status") 439 all = all_as_maps(cursor) 440 cursor.close() 441 442 # Pull data from result 443 row = all[0] 444 self.mMaster = row['Master_Host'] 445 self.mSlaveRunning = ('Yes' == row['Slave_SQL_Running']) and ('Yes' == row['Slave_IO_Running']) 446 self.mSlaveBehind = row['Seconds_Behind_Master'] 447 if None == self.mSlaveBehind: 448 self.mSlaveBehind = 99999 449 450 # Update process list stats 451 self.getProcessList() 452 453 def isSlaveHealthy(self, verbose = False): 454 healthy = True 455 if not self.isAlive(): 456 healthy = False 457 if verbose: 458 print "%s: Unable to connect to database!" % self.mHost 459 if not self.mSlaveRunning: 460 healthy = False 461 if verbose: 462 print "%s: Slave is not running!" % self.mHost 463 if self.mSlaveBehind > MAX_SLAVE_BEHIND: 464 healthy = False 465 if verbose: 466 print "%s: Slave is %d seconds behind" % (self.mHost, self.mSlaveBehind) 467 if self.mNumRunning > MAX_SLAVE_RUNNING: 468 healthy = False 469 if verbose: 470 print "%s: Slave has %d running processes" % (self.mHost, self.mNumRunning) 471 #if self.mAvgAge > MAX_SLAVE_AVG_AGE: 472 # healthy = False 473 # if verbose: 474 # print "%s: Slave has %f average age" % (self.mHost, self.mAvgAge) 475 return healthy 476 477 def slaveStatusString(self): 478 return "%s: Slave: Run: %s\tBehind: %d\tRun Procs: %d\tAvgAge: %.2f" \ 479 % (self.mHost, str(self.mSlaveRunning), self.mSlaveBehind, self.mNumRunning, self.mAvgAge) 480 481 def dumpSlave(self): 482 """Dump slave-specific info""" 483 print self.slaveStatusString() 484 485def main(): 486 """Simple test stub which dumps how happy a particular database is.""" 487 db_host = sys.argv[1] 488 dbm = DBMonitor(db_host) 489 while 1: 490 dbm.safeToRunQuery(True) 491 dbm.dump() 492 print 493 time.sleep(1.0) 494 495if __name__ == "__main__": 496 main()