PageRenderTime 44ms CodeModel.GetById 22ms app.highlight 18ms RepoModel.GetById 1ms app.codeStats 0ms

/mysql_watcher/dblibs/dbmonitor.py

https://bitbucket.org/lindenlab/apiary/
Python | 496 lines | 428 code | 20 blank | 48 comment | 19 complexity | d37f656c5f7764cdf995c74df75a0b8e MD5 | raw file
  1#!/usr/bin/env python
  2#
  3# $LicenseInfo:firstyear=2007&license=mit$
  4# 
  5# Copyright (c) 2007-2010, Linden Research, Inc.
  6# 
  7# Permission is hereby granted, free of charge, to any person obtaining a copy
  8# of this software and associated documentation files (the "Software"), to deal
  9# in the Software without restriction, including without limitation the rights
 10# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 11# copies of the Software, and to permit persons to whom the Software is
 12# furnished to do so, subject to the following conditions:
 13# 
 14# The above copyright notice and this permission notice shall be included in
 15# all copies or substantial portions of the Software.
 16# 
 17# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 18# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 19# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 20# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 21# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 22# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 23# THE SOFTWARE.
 24# $/LicenseInfo$
 25#
 26
 27#
 28# Utility classes that allow us to monitor and keep track of databases
 29#
 30import copy
 31import MySQLdb
 32from dbutil import *
 33
 34MAX_SLAVE_BEHIND = 2 # Maximum number of seconds a slave can be behind
 35MAX_SLAVE_RUNNING = 4 # Maximum number of running processes allowed on the slave
 36MAX_SLAVE_AVG_AGE = 2 # Maximum average for all running queries
 37
 38
 39def safe_delete(host, delete_sql, num_rows = 0, verbose = False, stop_time = 0, paranoid = False, batch = True, ignored_slaves = ()):
 40    """Safely do a bulk delete using the delete string on the specified database host
 41    Returns the number of rows deleted, or -1 if there was an error"""
 42
 43    # Validate the incoming statement
 44    if re.compile(".*\s+limit\s+\d+").match(delete_sql.lower()):
 45        print "Query '%s' appears to have a LIMIT clause, aborting" % delete_sql
 46        return -1
 47
 48    dbm = DBMonitor(host)
 49    dbm.mSafeParanoidSlaves = paranoid
 50    dbm.mIgnoredSlaves = ignored_slaves
 51    deleted = 0
 52
 53    MIN_ROWS = 500 # Minimum number of rows to delete at once
 54    MAX_ELAPSED = 15.0 # Elapsed time to target for a single delete
 55
 56    # Dynamically adapt these variables to ensure optimum delete rates
 57    limit_rows = MIN_ROWS
 58    sleep_time = 1.0
 59    while 1:
 60        # Exit or continue based on the stop time
 61        if stop_time and (stop_time < time.time()):
 62            # Always print this
 63            if verbose:
 64                print "Time has run out, %d deletes completed!" % deleted
 65            return ("Timeout", deleted)
 66
 67        safe = dbm.safeToRunQuery(verbose)
 68        if verbose:
 69            print time.ctime()
 70            dbm.dump()
 71        sleep_time = 1.0 # Sleep for 1 second between attempts, by default
 72
 73        # Run the query if safe
 74        if safe:
 75            db = dbm.connect(True)
 76            if db:
 77                cursor = db.cursor()
 78                if batch:
 79                    full_query = delete_sql + " LIMIT %d" % limit_rows
 80                else:
 81                    full_query = delete_sql
 82                if verbose:
 83                    print "Executing %s" % full_query
 84                    sys.stdout.flush()
 85                begin_time = time.time()
 86                cursor.execute(full_query)
 87                end_time = time.time()
 88                elapsed = end_time - begin_time
 89
 90                # Exit or continue based on number of rows deleted
 91                if cursor.rowcount < 0:
 92                    return ("Error", deleted)
 93                else:
 94                    # Increment the number of deleted rows
 95                    deleted += cursor.rowcount
 96
 97                if cursor.rowcount < limit_rows:
 98                    # We've deleted everything, as otherwise we would have deleted limit_rows
 99                    return ("OK", deleted)
100
101                if num_rows and (deleted >= num_rows):
102                    # If we've specified a number of rows to try to delete and we've hit it, exit
103                    return ("OK", deleted)
104
105                # Adapt the number of rows to delete based on execution time
106                limit_rows = limit_rows*(MAX_ELAPSED/elapsed)
107                limit_rows = int((limit_rows//100)*100)
108                limit_rows = max(MIN_ROWS, limit_rows)
109
110                # Clamp us to delete exactly the specified number of rows if we have num_rows
111                if num_rows:
112                    limit_rows = min(num_rows - deleted, limit_rows)
113                
114                # Sleep for as long as the delete took before trying again, this should
115                # give us a reasonable sleep time
116                sleep_time = min(2*MAX_SLAVE_BEHIND, max(0.25, elapsed))
117                if verbose:
118                    print time.ctime()
119                    print "Total deleted: %d" % deleted
120                    print "Elapsed time: %s" % elapsed
121                cursor.close()
122        if verbose:
123            print
124            sys.stdout.flush()
125        time.sleep(sleep_time)
126
127def log_delete(host, query, end_time, table, paranoid = True, verbose = False, batch = True):
128    """A convenience function for cleanups inside cron jobs"""
129    print time.ctime(), "Deleting from %s" % table
130    (status, count) = safe_delete(host, query, verbose = verbose, stop_time = end_time, paranoid = paranoid, batch = batch)
131    if "Error" == status:
132        print >> sys.stderr, time.ctime(), "%s: %s failed" % (table, query)
133        return True
134    elif "Timeout" == status:
135        print >> sys.stderr, time.ctime(), "%s: Timeout, %d rows deleted" % (table, count)
136        return True
137    else:
138        print time.ctime(), "%s: %d rows deleted" % (table, count)
139        return False
140
141class DBMonitor:
142    """Monitor a DB and all of its slaves for health"""
143    def __init__(self, host, user, password):
144        # Generic database information
145        self.mHost = host
146        self.mUser = user
147        self.mPassword = password
148        self.mAlive = False
149        self.mNumProcesses = 0
150        self.mNumRunning = 0
151        self.mNumSleeping = 0
152        self.mAvgAge = 0.0
153        self.mIsSlave = False
154        self.mDB = None
155        self.mProcessRows = None # Contains the results of the last process row queries
156
157        # Master specific info
158        self.mSlaves = None # A list of all slaves
159
160        # Slave specific info
161        self.mMaster = ""
162        self.mSlaveRunning = False
163        self.mSlaveBehind = 0
164
165        self.mIgnoredSlaves = () # A list of slaves to ignore
166
167        # Database "health" options
168        self.mSafeParanoidSlaves = True # All slaves have to be healthy
169        self.mSafeMaxProcesses = 5      # Max processes to allow to run on this host when safe
170        self.mSafeMaxAvgAge = 300       # If multiple queries, maximum allowed average age
171        
172    def connect(self, force_reconnect = False):
173        """Connect to the database - flush the existing connection if we're already connected."""
174        if self.mDB and force_reconnect:
175            self.mDB.close()
176            self.mDB = None
177
178        if not self.mDB:
179            try:
180                self.mDB = MySQLdb.connect(self.mHost,
181                                           self.mUser,
182                                           self.mPassword)
183            except:
184                print "Failed to connect to database on %s" % self.mHost
185                self.mDB = None
186                self.mAlive = False
187                return None
188
189            self.mDB.autocommit(True)
190
191        self.mAlive = True
192        return self.mDB
193    def disconnect(self):
194        self.mDB.close()
195        self.mDB = None
196    def killProcess(self, pid):
197        self.connect(force_reconnect=False)
198        if self.mDB:
199            cursor = self.mDB.cursor()
200            try:
201                cursor.execute("kill %d" % pid)
202            except:
203                pass
204            cursor.close()
205        else:
206            print "Couldn't get database"
207
208    def getProcessList(self):
209        """Return all of the running processes on the DB in a sequence of row maps."""
210
211        # Flush existing statistics
212        self.mNumProcesses = 0
213        self.mNumRunning = 0
214        self.mNumSleeping = 0
215        
216        # Connect to the DB
217        self.connect()
218        if not self.isAlive():
219            return ()
220        cursor = self.mDB.cursor()
221        cursor.execute("show full processlist")
222        all = all_as_maps(cursor)
223        cursor.close()
224
225        # Generate statistics
226        self.mNumProcesses = len(all)
227        running_ages = 0.0
228
229        self.mProcessRows = all
230
231        # Accumulate statistics for all processes
232        for row in all:
233            # Ignore backups
234            if (row['Command'] == 'Query') and (row['User'] != 'backup'):
235                self.mNumRunning += 1
236                running_ages += row['Time']
237            elif row['Command'] == 'Sleep':
238                self.mNumSleeping += 1
239        self.mAvgAge = running_ages/self.mNumRunning
240        return all
241
242    def explain(self, database, query):
243        self.connect()
244        cursor = self.mDB.cursor()
245        if database:
246            cursor.execute("use %s" % database)
247        query = query.replace('\n',' ')
248        query = re.sub('\s+', ' ', query)
249        explain_str = "explain %s" % query
250
251        out = {}
252        try:
253            cursor.execute(explain_str)
254        except:
255            #print "Exception in explain for db: %s query:%s" % (database, query)
256            return None
257        all = all_as_maps(cursor)
258        # Reorganize join info by query
259        out['explain_str'] = explain_str
260        out['tables'] = {}
261        explain_tables = out['tables']
262        out['analysis'] = {}
263        out['raw_explain'] = copy.deepcopy(all)
264        for row in all:
265            table = row['table']
266            if not table:
267                table = ''
268            #if table in explain_tables:
269            #    continue
270            explain_tables[table] = copy.deepcopy(row)
271            for key in row.keys():
272                if None == key:
273                    raise "Bad row:", row
274            if not row['possible_keys']:
275                if not 'no_key' in out['analysis']:
276                    out['analysis']['no_key'] = []
277                out['analysis']['no_key'].append(table)
278        return out
279
280    def safeToRunQuery(self, verbose = False):
281        """Return if it's safe to run an expensive query"""
282        # Get updates on the master
283        self.getProcessList()
284        
285        # Get updates on all of the slaves
286        if not self.mSlaves:
287            self.getSlaves()
288        self.updateSlaves()
289
290        # Now, check everything we care about
291        MAX_PROCESSES = 5 # Maximum number of running processes on the master
292
293        safe = True
294        if self.mNumRunning > self.mSafeMaxProcesses:
295            safe = False
296            if verbose:
297                print "%s: Unsafe: %d running processes" % (self.mHost, self.mNumRunning)
298        if self.mAvgAge > self.mSafeMaxAvgAge:
299            safe = False
300            if verbose:
301                print "%s: Unsafe: %f average query age" % (self.mHost, self.mAvgAge)
302
303        num_slaves = self.mSlaves
304        # Check if slaves are OK, but only if we have some
305        if num_slaves:
306            healthy_slaves = 0
307            for slave in self.mSlaves.values():
308                if slave.isSlaveHealthy(verbose):
309                    healthy_slaves += 1
310                elif slave.mHost in self.mIgnoredSlaves:
311                    # Pretend the slave is healthy
312                    healthy_slaves += 1
313                    if verbose:
314                        print "Ignoring unhealthy slave %s" % slave.mHost
315
316
317            if ((self.mSafeParanoidSlaves and (healthy_slaves != len(self.mSlaves)))
318                or (not healthy_slaves)):
319                safe = False
320                if verbose:
321                    print "Not enough healthy slaves (%d/%d)" % (healthy_slaves, len(self.mSlaves))
322        
323        if not safe and verbose:
324            print "%s: Unsafe to run query!" % self.mHost
325        elif verbose:
326            print "%s: Safe to run query" % self.mHost
327        return safe
328
329    def masterStatusString(self):
330        return "%s: Master: (%d/%d) AvgAge %.2f" % (self.mHost, self.mNumRunning, self.mNumProcesses, self.mAvgAge)
331
332    def dump(self):
333        """Dump useful and interesting information about this database and its slaves"""
334        print self.masterStatusString()
335        for slave in self.mSlaves.values():
336            slave.dumpSlave()
337
338    def isAlive(self):
339        return self.mAlive
340
341    #
342    # Get all table information for a database
343    #
344    def getTableInfo(self):
345        # Connect to the DB
346        self.connect()
347        if not self.isAlive():
348            return ()
349        cursor = self.mDB.cursor()
350        cursor.execute("show databases")
351        all_dbs = all_as_maps(cursor)
352        dbs = {}
353        # Get all the databases
354        for db in all_dbs:
355            dbs[db['Database']] = {}
356
357        bad_tables = ['tmp', 'information_schema', 'mysql','secondopinion']
358        # Iterate through all the databases and get table information.
359        for db in dbs.keys():
360            dbs[db]['tables'] = {}
361            if db in bad_tables:
362                continue
363            print "Database:", db
364            print "----------------------"
365            cursor.execute("use %s" % db)
366            cursor.execute("show table status")
367            all_tables = all_as_maps(cursor)
368            print "Name Rows Data Index"
369            print "---------------------"
370            for table in all_tables:
371                dbs[db]['tables'][table['Name']] = copy.deepcopy(table)
372                print table['Name'],table['Rows'], table['Data_length'], table['Index_length']
373            print
374        return dbs
375
376    #
377    # Get all tables and their fields for the sequence and indra DBs
378    # as a nested dict
379    def getTableFields(self):
380        # Connect to the DB
381        self.connect()
382        if not self.isAlive():
383            return ()
384        cursor = self.mDB.cursor()
385        dbs = {'sequence':{},'indra':{}}
386        # Iterate through all the databases and get table information.
387        for db in dbs.keys():
388            cursor.execute("use %s" % db)
389            cursor.execute("show table status")
390            all_tables = all_as_maps(cursor)
391            for table in all_tables:
392                cursor.execute("desc %s" % table['Name'])
393                dbs[db][table['Name']] = [row[0] for row in cursor.fetchall()]
394        return dbs
395
396    #
397    # Slave management methods for master
398    #
399    def getSlaves(self):
400        """Get a list of all of the slaves of this mysql host"""
401        procs = self.getProcessList()
402        self.clearSlaves()
403        for row in procs:
404            if row['Command'] == 'Binlog Dump':
405                host, port = row['Host'].split(':')
406                hostname = socket.gethostbyaddr(host)[0]
407                slave = DBMonitor(hostname, self.mUser, self.mPassword)
408                slave.mIsSlave = True
409                self.mSlaves[hostname] = slave
410        #print self.mSlaves
411
412    def clearSlaves(self):
413        """Cleanup all slave DBs"""
414        self.mSlaves = {}
415
416    def updateSlaves(self):
417        """Update the status of all of the slave DBs"""
418        for slave in self.mSlaves.values():
419            slave.updateSlaveStatus()
420        pass
421
422    #
423    # Slave management methods for slave
424    #
425    def updateSlaveStatus(self):
426        """Get information about slave status"""
427
428        # Flush existing data
429        self.mMaster = ""
430        self.mSlaveRunning = False
431        self.mSlaveBehind = 0
432        
433        # Connect to the database
434        self.connect()
435        if not self.isAlive():
436            return
437        cursor = self.mDB.cursor()
438        cursor.execute("show slave status")
439        all = all_as_maps(cursor)
440        cursor.close()
441
442        # Pull data from result
443        row = all[0]
444        self.mMaster = row['Master_Host']
445        self.mSlaveRunning = ('Yes' == row['Slave_SQL_Running']) and ('Yes' == row['Slave_IO_Running'])
446        self.mSlaveBehind = row['Seconds_Behind_Master']
447        if None == self.mSlaveBehind:
448            self.mSlaveBehind = 99999
449
450        # Update process list stats
451        self.getProcessList()
452
453    def isSlaveHealthy(self, verbose = False):
454        healthy = True
455        if not self.isAlive():
456            healthy = False
457            if verbose:
458                print "%s: Unable to connect to database!" % self.mHost
459        if not self.mSlaveRunning:
460            healthy = False
461            if verbose:
462                print "%s: Slave is not running!" % self.mHost
463        if self.mSlaveBehind > MAX_SLAVE_BEHIND:
464            healthy = False
465            if verbose:
466                print "%s: Slave is %d seconds behind" % (self.mHost, self.mSlaveBehind)
467        if self.mNumRunning > MAX_SLAVE_RUNNING:
468            healthy = False
469            if verbose:
470                print "%s: Slave has %d running processes" % (self.mHost, self.mNumRunning)
471        #if self.mAvgAge > MAX_SLAVE_AVG_AGE:
472        #   healthy = False
473        #   if verbose:
474        #       print "%s: Slave has %f average age" % (self.mHost, self.mAvgAge)
475        return healthy
476
477    def slaveStatusString(self):
478        return "%s: Slave: Run: %s\tBehind: %d\tRun Procs: %d\tAvgAge: %.2f" \
479               % (self.mHost, str(self.mSlaveRunning), self.mSlaveBehind, self.mNumRunning, self.mAvgAge)
480        
481    def dumpSlave(self):
482        """Dump slave-specific info"""
483        print self.slaveStatusString()
484
485def main():
486    """Simple test stub which dumps how happy a particular database is."""
487    db_host = sys.argv[1]
488    dbm = DBMonitor(db_host)
489    while 1:
490        dbm.safeToRunQuery(True)
491        dbm.dump()
492        print
493        time.sleep(1.0)
494
495if __name__ == "__main__":
496    main()