/mysql_watcher/dblibs/dbutil.py
- #!/usr/bin/env python
- #
- # $LicenseInfo:firstyear=2007&license=mit$
- #
- # Copyright (c) 2007-2010, Linden Research, Inc.
- #
- # Permission is hereby granted, free of charge, to any person obtaining a copy
- # of this software and associated documentation files (the "Software"), to deal
- # in the Software without restriction, including without limitation the rights
- # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- # copies of the Software, and to permit persons to whom the Software is
- # furnished to do so, subject to the following conditions:
- #
- # The above copyright notice and this permission notice shall be included in
- # all copies or substantial portions of the Software.
- #
- # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- # THE SOFTWARE.
- # $/LicenseInfo$
- #
- #
- # Utility classes that allow us to monitor and keep track of databases
- #
- import array
- import binascii
- import gzip
- import math
- import os
- import re
- import socket
- import string
- import struct
- import sys
- import time
- from llbase import llsd
- def asciify(str):
- "Lame ASCIIfication of a string to keep various things from barfing"
- out_str = ""
- for ch in str:
- if (ch >= chr(0x9)) and (ch <= '~'):
- out_str += ch
- else:
- out_str += "."
- return out_str
- def all_as_maps(cursor):
- """Return all of the cursor with maps for each row instead of sequences"""
- all_seq = cursor.fetchall()
- ret_all = []
- descs = cursor.description
- for row in all_seq:
- new_row = {}
- count = 0
- for desc in descs:
- new_row[desc[0]] = row[count]
- count += 1
- ret_all.append(new_row)
- return ret_all
- #
- # Cache IP to string lookup to make it faster
- #
- ip_table = {}
- def lookup_ip_string(ip_bin):
- if not ip_bin in ip_table:
- ip_table[ip_bin] = "%d.%d.%d.%d" % ((ip_bin & 0xff000000L) >> 24,
- (ip_bin & 0x00ff0000L) >> 16,
- (ip_bin & 0x0000ff00L) >> 8,
- ip_bin & 0x000000ffL)
- return ip_table[ip_bin]
- def llquery_from_llsd(query_llsd):
- # Hack, fill in arbitrary data for info that isn't serialized
- query = LLQuery(None, None, query_llsd['query'], 0.0)
- query.mData['host_clean'] = query_llsd['host_clean']
- query.mData['query_clean'] = query_llsd['query_clean']
- # Hack, keeps correctOutliers from trashing the data
- #query.mNumQueries = query_llsd['num_queries']
- #query.mTotalTime = query_llsd['total_time']
- try:
- query.mNumQueriesCorrected = query_llsd['num_queries_corrected']
- query.mTotalTimeCorrected = query_llsd['total_time_corrected']
- except KeyError:
- # Hack for old output which didn't generate this data
- query.mNumQueriesCorrected = query_llsd['num_queries']
- query.mTotalTimeCorrected = query_llsd['total_time']
-
- return query
- def get_query_tables(query):
- "Return the list of tables in a query"
- #
- # Really dumb method, literally iterates through a bunch of regular expressions to pull this out.
- # There are probably better methods out there.
- #
-
- out_tables = []
- # Clean up the query
- query = query.replace('\n',' ')
- query = re.sub('\s+', ' ', query)
-
- m = LLQuery.sSelectWhereRE.match(query)
- if m:
- # Split apart by commas
- tables = m.group(1).split(',')
- for table in tables:
- # Take the first part (which is table name)
- out_tables.append(string.strip(table.split()[0]))
- return out_tables
-
- m = LLQuery.sSelectRE.match(query)
- if m:
- out_tables.append(string.strip(m.group(1)))
- return out_tables
-
- m = LLQuery.sUpdateRE.match(query)
- if m:
- # Split apart by commas
- tables = m.group(1).split(',')
- for table in tables:
- # Take the first part (which is table name)
- out_tables.append(string.strip(table.split()[0]))
- return out_tables
- m = LLQuery.sReplaceRE.match(query)
- if m:
- out_tables.append(string.strip(m.group(1)))
- return out_tables
-
- m = LLQuery.sInsertRE.match(query)
- if m:
- out_tables.append(string.strip(m.group(1)))
- return out_tables
- m = LLQuery.sDeleteRE.match(query)
- if m:
- out_tables.append(string.strip(m.group(1)))
- return out_tables
- return out_tables
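- # Illustrative, hand-traced examples given the regexes defined on LLQuery below
- # (not exercised elsewhere in this module):
- #   get_query_tables("SELECT name FROM users WHERE id = 1") -> ['users']
- #   get_query_tables("UPDATE inventory, users SET owner = 1 WHERE id = 2") -> ['inventory', 'users']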
- MIN_BIN=-15
- MAX_BIN=10
- class LLQuery:
- "Represents all of the data associated with a query"
- fromLLSDStats = staticmethod(llquery_from_llsd)
- def __init__(self, host, port, query, start_time):
- # Store information which will be serialized for metadata in a map
- self.mData = {}
- self.mData['host'] = host
- self.mData['port'] = port
- self.mData['query'] = query
- # Metadata
- self.mData['host_clean'] = None
- self.mData['host_full'] = None
- self.mData['query_clean'] = None
- self.mData['tables'] = []
- #
- # Stats information
- #
- self.mNumQueries = 0
- self.mTotalTime = 0.0
- self.mOutQueries = 0
- self.mTotalTimeCorrected = 0.0 # Corrected to remove outliers
- self.mNumQueriesCorrected = 0 # Corrected to remove outliers
- # LLQueryStatBins for the query time histogram, as well as corrected time
- # Query times are collected into bins based on power of 2 execution times (in seconds).
- # Each bin collects the number of queries and total execution time. See LLQueryStatBin
- # for more details
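- # For example, a 3.0-second query has log2(3.0) ~= 1.58, which queryResponse()
- # truncates to bin 1 (covering roughly 2 to 4 seconds).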
- self.mBins = {} # Bins for histogram
- # This stuff doesn't usually get serialized
- self.mQueryLen = len(query)
- self.mStartTime = start_time
- self.mResponseTime = start_time
- def __hash__(self):
- return (self.mData['host_clean'] + ":" + self.mData['query_clean']).__hash__()
- def __eq__(self, other):
- # Note, this matches on clean, not strictly correct
- if ((self.mData['query_clean'] == other.mData['query_clean']) and
- (self.mData['host_clean'] == other.mData['host_clean'])):
- return True
- return False
- def getKey(self):
- # The string key is just the clean host and query, concatenated
- return self.mData['host_clean'] + ":" + self.mData['query_clean']
-
- def clean(self):
- "Generate the clean query so it can be used for statistics"
- if not self.mData['host_clean']:
- (self.mData['host_clean'], self.mData['host_full']) = get_host_type(self.mData['host'])
- self.mData['query_clean'] = clean_query(self.mData['query'], 0)
- def getAvgTimeCorrected(self):
- "Average time per query, corrected for outliers"
- return self.mTotalTimeCorrected/self.mNumQueriesCorrected
- def queryStart(self):
- "When collecting query stats, use this when the query is receieved"
- self.mNumQueries += 1
- self.mOutQueries += 1
- def queryResponse(self, elapsed):
- "When collecting stats, use this when the response is received"
- self.mTotalTime += elapsed
- self.mOutQueries -=1
- # Determine which stat bin this query is in
- bin = MIN_BIN
- if elapsed:
- bin = int(math.log(elapsed,2))
- bin = max(MIN_BIN, bin)
- bin = min(MAX_BIN, bin)
- if bin not in self.mBins:
- self.mBins[bin] = LLQueryStatBin(bin)
- self.mBins[bin].accumulate(elapsed)
- def correctOutliers(self):
- "Find outliers bins and calculate corrected results"
- # Outlier bins have query counts which are less than 1% of the total count for that query
- if not self.mNumQueries:
- # FIXME: This is a hack because we don't save this information in the query count dump
- return
- min_queries = self.mNumQueries/100
- self.mTotalTimeCorrected = 0.0
- self.mNumQueriesCorrected = 0
- for i in self.mBins.keys():
- if self.mBins[i].mNumQueries < min_queries:
- # Outlier, flag as such.
- self.mBins[i].mOutlier = True
- else:
- self.mTotalTimeCorrected += self.mBins[i].mTotalTime
- self.mNumQueriesCorrected += self.mBins[i].mNumQueries
- if self.mNumQueriesCorrected == 0:
- #HACK: Deal with divide by zero
- self.mNumQueriesCorrected = 1
- # Miscellaneous regular expressions to analyze the query type
- sReadRE = re.compile("(SELECT.*)|(USE.*)", re.IGNORECASE)
- sSelectWhereRE = re.compile("\(?\s*?SELECT.+?FROM\s+\(?(.*?)\)?\s+WHERE.*", re.IGNORECASE)
- sSelectRE = re.compile("\(?\s*?SELECT.+?FROM\s+(.+)(?:\s+LIMIT.*|.*)", re.IGNORECASE)
- sUpdateRE = re.compile("UPDATE\s+(.+?)\s+SET.*", re.IGNORECASE)
- sReplaceRE = re.compile("REPLACE INTO\s+(.+?)(?:\s*\(|\s+SET).*", re.IGNORECASE)
- sInsertRE = re.compile("INSERT.+?INTO\s+(.+?)(?:\s*\(|\s+SET).*", re.IGNORECASE)
- sDeleteRE = re.compile("DELETE.+?FROM\s+(.+?)\s+WHERE.*", re.IGNORECASE)
- def analyze(self):
- "Does some query analysis on the query"
- query = self.mData['query_clean']
- self.mData['tables'] = get_query_tables(query)
- if 'type' in self.mData:
- # Already analyzed
- return
- if LLQuery.sReadRE.match(query):
- self.mData['type'] = 'read'
- else:
- self.mData['type'] = 'write'
- def dumpLine(self, elapsed, query_len = 0):
- "Dump a semi-human-readable stats line for reporting"
- bin_str = ''
- for i in range(MIN_BIN,MAX_BIN+1):
- if i in self.mBins:
- if self.mBins[i].mOutlier:
- bin_str += '*'
- else:
- bin_str += str(int(math.log10(self.mBins[i].mNumQueries)))
- else:
- bin_str += '.'
- if not query_len:
- query_len = 4096
- num_queries = self.mNumQueriesCorrected
- if not num_queries:
- num_queries = 1
- return ("%s\t%5d\t%6.2f\t%6.2f\t%1.4f\t%s\t" % (bin_str, num_queries,
- num_queries/elapsed, self.mTotalTimeCorrected,
- self.mTotalTimeCorrected/num_queries, self.mData['host_clean'])) \
- + self.mData['query_clean'][0:query_len]
- def as_map(self):
- "Make an LLSD map version of data that can be used for merging"
- self.analyze()
- self.mData['num_queries'] = self.mNumQueries
- self.mData['total_time'] = self.mTotalTime
- self.mData['num_queries_corrected'] = self.mNumQueriesCorrected
- self.mData['total_time_corrected'] = self.mTotalTimeCorrected
- return self.mData
- class LLConnStatus:
- "Keeps track of the status of a connection talking to mysql"
- def __init__(self, ip_port, start_time):
- self.mLastMysqlPacketNumber = 0
- self.mNumPackets = 0
- self.mIPPort = ip_port
- self.mStartTime = start_time
- self.mLastUpdate = start_time
- self.mCurState = ""
- self.mLastQuery = None
- self.mNumQueries = 0
- def quit(self, src_ip, src_port, pkt_time):
- query = LLQuery(src_ip, src_port, "Quit", pkt_time)
- query.clean()
- self.mLastUpdate = pkt_time
- self.mLastQuery = query
- self.mNumPackets += 1
- def queryStart(self, src_ip, src_port, pkt_time, raw, pkt_len, offset):
- query_len = pkt_len - 1
- query = LLQuery(src_ip, src_port, raw[offset:offset + (pkt_len - 1)], pkt_time)
- self.mLastUpdate = pkt_time
- # Packet length includes the command, offset into raw doesn't
- if query_len > (len(raw) - offset):
- query.mQueryLen = query_len
- self.mCurState = "SendingQuery"
- else:
- self.mCurState = "QuerySent"
- query.clean()
- self.mNumQueries += 1
- self.mLastQuery = query
- self.mNumPackets += 1
- def queryStartProcessed(self, src_ip, src_port, pkt_time, query_str):
- query = LLQuery(src_ip, src_port, query_str, pkt_time)
- query.clean()
- self.mLastUpdate = pkt_time
- self.mCurState = "QuerySent"
- self.mNumQueries += 1
- self.mLastQuery = query
- self.mNumPackets += 1
- def updateNonCommand(self, pkt_time, raw):
- # Clean up an existing query if you get a non-command.
- self.mNumPackets += 1
- self.mLastUpdate = pkt_time
- if self.mLastQuery:
- if self.mCurState == "SendingQuery":
- # We're continuing a query
- # We won't generate a new clean version, because it'll $!@# up all the sorting.
- self.mLastQuery.mData['query'] += raw
- if len(self.mLastQuery.mData['query']) == self.mLastQuery.mQueryLen:
- self.mCurState = "QuerySent"
- self.mLastQuery.clean()
- return
- else:
- #
- # A non-command that's continuing a query. Not sure why this is happening,
- # but clear the last query to avoid generating inadvertent long query results.
- #
- self.mLastQuery = None
- # Default to setting state to "NonCommand"
- self.mCurState = "NonCommand"
- def updateResponse(self, pkt_time, result_type):
- # If we've got a query running, accumulate the elapsed time
- start_query_response = False
- if self.mCurState == "QuerySent":
- lq = self.mLastQuery
- if lq:
- if lq.mStartTime == 0.0:
- lq.mStartTime = pkt_time
- lq.mResponseTime = pkt_time
- start_query_response = True
- self.mLastUpdate = pkt_time
- if result_type == 0:
- self.mCurState = "Result:RecvOK"
- elif result_type == 0xff:
- self.mCurState = "Result:Error"
- elif result_type == 0xfe:
- self.mCurState = "Result:EOF"
- elif result_type == 0x01:
- self.mCurState = "Result:Header"
- else:
- self.mCurState = "Result:Data"
- return start_query_response
- def dump(self):
- if self.mLastQuery:
- print "%s: NumQ: %d State:%s\n\tLast: %s" % (self.mIPPort, self.mNumQueries, self.mCurState,
- self.mLastQuery.mData['query_clean'][0:40])
- else:
- print "%s: NumQ: %d State:%s\n\tLast: None" % (self.mIPPort, self.mNumQueries, self.mCurState)
- class LLQueryStatBin:
- "Keeps track of statistics for one query bin"
- def __init__(self, power):
- self.mMinTime = pow(2, power)
- self.mMaxTime = pow(2, power+1)
- self.mTotalTime = 0
- self.mNumQueries = 0
- self.mOutlier = False
- def accumulate(self, elapsed):
- self.mTotalTime += elapsed
- self.mNumQueries += 1
- def dump_query_stat_header():
- return "LogHistogram (-15:10) \tCount\tQPS\tTotal\tAvg\tHost\tQuery"
- class LLQueryStatMap:
- def __init__(self, description, start_time):
- self.mDescription = description
- self.mQueryMap = {}
- self.mStartTime = start_time
- self.mFinalTime = 0
- self.mLastTime = self.mStartTime
- self.mQueryStartCount = 0
- self.mQueryResponseCount = 0
- def load(self, fn):
- "Load dumped query stats from an LLSD file"
- # Read in metadata
- in_file = open(fn)
- in_string = in_file.read()
- in_file.close()
- in_llsd = llsd.LLSD.parse(in_string)
- info = in_llsd[0]
- query_list = in_llsd[1]
- self.mDescription = info['description']
- self.mStartTime = info['start_time']
- self.mLastTime = info['last_time']
- self.mFinalTime = info['last_time']
- self.mQueryStartCount = info['query_start_count']
- self.mQueryResponseCount = info['query_response_count']
- # Iterate through all the queries, and populate the query map.
- for query_row in query_list:
- query = LLQuery.fromLLSDStats(query_row)
- self.mQueryMap[query.getKey()] = query
- def analyze(self):
- for query in self.mQueryMap.values():
- query.analyze()
- def queryStart(self, query):
- if not query in self.mQueryMap:
- #query.analyze()
- self.mQueryMap[query] = query
- self.mQueryMap[query].queryStart()
- # Update elapsed time for this map
- self.mLastTime = query.mStartTime
- if self.mLastTime < self.mStartTime:
- self.mStartTime = self.mLastTime
- if self.mLastTime > self.mFinalTime:
- self.mFinalTime = self.mLastTime
- self.mQueryStartCount += 1
-
- def queryResponse(self, query):
- if not query in self.mQueryMap:
- self.queryStart(query)
- elapsed = query.mResponseTime - query.mStartTime
- self.mQueryMap[query].queryResponse(elapsed)
- self.mLastTime = query.mResponseTime
- if self.mLastTime > self.mFinalTime:
- self.mFinalTime = self.mLastTime
- self.mQueryResponseCount += 1
- def getElapsedTime(self):
- return self.mFinalTime - self.mStartTime
- def getQPS(self):
- return self.mQueryStartCount / self.getElapsedTime()
- def correctOutliers(self):
- for query in self.mQueryMap.values():
- query.correctOutliers()
- def getSortedKeys(self, sort_by = "total_time"):
- "Gets a list of keys sorted by sort type"
- self.correctOutliers()
-
- items = self.mQueryMap.items()
- backitems = None
- if sort_by == "total_time":
- backitems = [[v[1].mTotalTimeCorrected, v[0]] for v in items]
- elif sort_by == "count":
- backitems = [[v[1].mNumQueriesCorrected, v[0]] for v in items]
- elif sort_by == "avg_time":
- backitems = [[v[1].getAvgTimeCorrected(), v[0]] for v in items]
- else:
- # Fallback, sort by total time
- backitems = [[v[1].mTotalTimeCorrected, v[0]] for v in items]
- backitems.sort()
- backitems.reverse()
- # Get the keys out of the items
- sorted_keys = []
- for pair in backitems:
- sorted_keys.append(pair[1])
- return sorted_keys
- def getSortedStats(self, sort_by = "total_time", num_stats = 0):
- "Gets a list of the top queries according to sort type"
- sorted_keys = self.getSortedKeys(sort_by)
- if num_stats == 0:
- l = len(sorted_keys)
- else:
- l = min(num_stats, len(sorted_keys))
- stats = []
- for i in range(0, l):
- stats.append(self.mQueryMap[sorted_keys[i]])
- return stats
- def dumpStatus(self, sort_type = "total_time", elapsed = None):
- # Dump status according to total time
- if not elapsed:
- elapsed = self.getElapsedTime()
- sorted_stats = self.getSortedStats(sort_type)
- for query in sorted_stats:
- print query.dumpLine(elapsed, 60)
- def dumpLLSD(self, filename):
- # Analyze queries to generate metadata
- self.analyze()
- # Dump an LLSD document representing the entire object
- out = []
- # First, dump all the metadata into the first block
- info_map = {}
- info_map['description'] = self.mDescription
- info_map['start_time'] = self.mStartTime
- info_map['last_time'] = self.mLastTime
- info_map['query_start_count'] = self.mQueryStartCount
- info_map['query_response_count'] = self.mQueryResponseCount
- out.append(info_map)
- # Dump all of the query info into the second block
- sorted_stats = self.getSortedStats("total_time")
- query_list = []
- for query in sorted_stats:
- query_list.append(query.as_map())
- out.append(query_list)
- f = open(filename, "w")
- f.write(str(llsd.LLSD(out)))
- f.close()
- def dumpTiming(self, filename):
- cur_time = time.time()
- f = open(filename, "w")
- f.write(dump_query_stat_header() + "\n")
- # Sort the queries
- sorted_stats = self.getSortedStats("total_time")
- for query in sorted_stats:
- f.write(query.dumpLine(cur_time - self.mStartTime))
- f.write("\n")
- f.close()
- def dumpCountsLLSD(self, filename):
- "Dump the query statistics as an LLSD doc, for later merging with the query_info doc"
- out = []
- # Put the metadata into a map
- info_map = {}
- info_map['description'] = self.mDescription
- info_map['start_time'] = self.mStartTime
- info_map['last_time'] = self.mLastTime
- info_map['query_start_count'] = self.mQueryStartCount
- info_map['query_response_count'] = self.mQueryResponseCount
- out.append(info_map)
- sorted_stats = self.getSortedStats("total_time")
- query_list = []
- for query in sorted_stats:
- query_row = {}
- # We only want to dump identifying info and stats, not metadata
- query_row['host_clean'] = query.mData['host_clean']
- # Convert the queries to utf-8 to make sure it doesn't break XML
- try:
- u = unicode(query.mData['query_clean'])
- query_row['query_clean'] = u.encode('utf-8')
- except:
- query_row['query_clean'] = 'NON-UTF8'
- try:
- u = unicode(query.mData['query'])
- query_row['query'] = u.encode('utf-8')
- except:
- query_row['query'] = 'NON-UTF8'
- query_row['count'] = query.mNumQueriesCorrected
- query_row['total_time'] = query.mTotalTimeCorrected
- query_row['avg_time'] = query.getAvgTimeCorrected()
- query_list.append(query_row)
- out.append(query_list)
- f = open(filename, "w")
- f.write(str(llsd.LLSD(out)))
- f.close()
- class LLBinnedQueryStats:
- "Keeps track of a fixed number of N minute bins of query stats"
- def __init__(self):
- self.mHourBins = {} # This will be keyed by unixtime seconds, eventually
- self.mMinuteBins = {}
- self.mLastUpdateHour = 0
- self.mLastUpdateMinute = 0
- def dumpTiming(self, path):
- # Dump hour bins
- for (key, value) in self.mHourBins.items():
- value.dumpTiming("%s/hour-%s-query_timing.txt" % (path, key))
- # Dump minute bins
- for (key, value) in self.mMinuteBins.items():
- value.dumpTiming("%s/minute-%s-query_timing.txt" % (path, key))
- def dumpCountsLLSD(self, path):
- # Dump hour bins
- for (key, value) in self.mHourBins.items():
- value.dumpCountsLLSD("%s/hour-%s-query_counts.llsd" % (path, key))
- # Dump minute bins
- for (key, value) in self.mMinuteBins.items():
- value.dumpCountsLLSD("%s/minute-%s-query_counts.llsd" % (path, key))
- def dumpLLSD(self, path):
- # Dump hour bins
- for (key, value) in self.mHourBins.items():
- value.dumpLLSD("%s/hour-%s-query_dump.llsd" % (path, key))
- # Dump minute bins
- for (key, value) in self.mMinuteBins.items():
- value.dumpLLSD("%s/minute-%s-query_dump.llsd" % (path, key))
- def flushOldBins(self, time_secs):
- for minute_bin_str in self.mMinuteBins.keys():
- bin_secs = time.mktime(time.strptime(minute_bin_str, "%Y-%m-%d-%H-%M"))
- if (time_secs - bin_secs) > 3*3600:
- del self.mMinuteBins[minute_bin_str]
- def queryStart(self, query):
- "Update associated bin for the time specified, creating if necessary"
- # Hour and minute bins
- t = time.localtime(query.mStartTime)
- hour_bin_str = time.strftime("%Y-%m-%d-%H", t)
- minute_bin_str = time.strftime("%Y-%m-%d-%H-%M", t)
- hour = t[3]
- minute = t[4]
- # FIXME: These start times are a bit inaccurate, but should be fine under heavy query load.
- if not hour_bin_str in self.mHourBins:
- self.mHourBins[hour_bin_str] = LLQueryStatMap(hour_bin_str, query.mStartTime)
- if not minute_bin_str in self.mMinuteBins:
- self.mMinuteBins[minute_bin_str] = LLQueryStatMap(minute_bin_str, query.mStartTime)
- self.mHourBins[hour_bin_str].queryStart(query)
- self.mMinuteBins[minute_bin_str].queryStart(query)
- if hour != self.mLastUpdateHour:
- self.mLastUpdateHour = hour
- # If the hour changes, dump and clean out old bins
- self.flushOldBins(query.mStartTime)
- def queryResponse(self, query):
- "Update associated bin for the time specified, creating if necessary"
- # Hour and minute bins
- t = time.localtime(query.mStartTime)
- hour_bin_str = time.strftime("%Y-%m-%d-%H", t)
- minute_bin_str = time.strftime("%Y-%m-%d-%H-%M", t)
- hour = t[3]
- minute = t[4]
- # FIXME: These start times are a bit inaccurate, but should be fine under heavy query load.
- if not hour_bin_str in self.mHourBins:
- self.mHourBins[hour_bin_str] = LLQueryStatMap(hour_bin_str, query.mStartTime)
- if not minute_bin_str in self.mMinuteBins:
- self.mMinuteBins[minute_bin_str] = LLQueryStatMap(minute_bin_str, query.mStartTime)
-
- self.mHourBins[hour_bin_str].queryResponse(query)
- self.mMinuteBins[minute_bin_str].queryResponse(query)
-
- # MySQL protocol sniffer, using tcpdump, ncap packet parsing and mysql internals
- # http://forge.mysql.com/wiki/MySQL_Internals_ClientServer_Protocol
- class LLQueryStream:
- "Process a raw tcpdump stream (in raw libpcap format)"
- def __init__(self, in_file):
- self.mInFile = in_file
- self.mStartTime = time.time()
- #
- # A list of all outstanding "connections", and what they're doing.
- # This is necessary in order to get script timing and other information.
- #
- self.mConnStatus = {}
- self.mConnKeys = []
- self.mConnCleanupIndex = 0
- #
- # Parse/skip past the libpcap global header
- #
-
- #guint32 magic_number; /* magic number */
- #guint16 version_major; /* major version number */
- #guint16 version_minor; /* minor version number */
- #gint32 thiszone; /* GMT to local correction */
- #guint32 sigfigs; /* accuracy of timestamps */
- #guint32 snaplen; /* max length of captured packets, in octets */
- #guint32 network; /* data link type */
- # Skip past the libpcap global header
- format = 'IHHiIII'
- size = struct.calcsize(format)
- header_bin = self.mInFile.read(size)
- res = struct.unpack(format, header_bin)
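- # A stricter reader could sanity-check res[0] here against the libpcap magic
- # number 0xa1b2c3d4 (or 0xd4c3b2a1 for a byte-swapped capture) before trusting
- # the rest of the stream; this code assumes native byte order and skips that.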
- def createConnection(self, client_ip_port, pkt_time):
- # Track the connection, create a new one or return existing
- if not client_ip_port in self.mConnStatus:
- self.mConnStatus[client_ip_port] = LLConnStatus(client_ip_port, pkt_time)
- # Track a new key that we need to garbage collect
- self.mConnKeys.append(client_ip_port)
- conn = self.mConnStatus[client_ip_port]
- return conn
-
- def closeConnection(self, ip_port):
- if ip_port in self.mConnStatus:
- del self.mConnStatus[ip_port]
- def cleanupConnection(self,cur_time):
- # Cleanup some number of stale connections.
- CONNECTION_EXPIRY=900.0
- if self.mConnCleanupIndex >= len(self.mConnKeys):
- self.mConnCleanupIndex = 0
- # Skip if no keys
- if len(self.mConnKeys) == 0:
- return
- key = self.mConnKeys[self.mConnCleanupIndex]
- if key in self.mConnStatus:
- # Clean up if it's too old
- if self.mConnStatus[key].mLastUpdate < (cur_time - CONNECTION_EXPIRY):
- del self.mConnStatus[key]
- #print "Cleaning up old key:", key
- #print "num conns:", len(self.mConnStatus)
- #print "num keys", len(self.mConnKeys)
- else:
- # Clean up if the connection is already removed
- del self.mConnKeys[self.mConnCleanupIndex]
- self.mConnCleanupIndex += 1
- def getNextEvent(self):
- # Get the next event out of the packet stream
- td_format = 'IIII'
- ip_format = '!BBHHHBBHII'
- tcp_format = '!HHIIBBHHH'
- while 1:
- #
- # Parse out an individual packet from the tcpdump stream
- #
- # Match the packet header
- # Pull a record (packet) off of the wire
- # Packet header
- # guint32 ts_sec; /* timestamp seconds */
- # guint32 ts_usec; /* timestamp microseconds */
- # guint32 incl_len; /* number of octets of packet saved in file */
- # guint32 orig_len; /* actual length of packet */
- ph_bin = self.mInFile.read(16)
- res = struct.unpack(td_format, ph_bin)
- ts_sec = res[0]
- ts_usec = res[1]
- pkt_time = ts_sec + (ts_usec/1000000.0)
- incl_len = res[2]
- orig_len = res[3]
- # Packet data (incl_len bytes)
- raw_data = self.mInFile.read(incl_len)
- # Parse out the MAC header
- # Don't bother, we don't care - 14 byte header
- mac_offset = 14
- # Parse out the IP header (min 20 bytes)
- # 4 bits - version
- # 4 bits - header length in 32 bit words
- # 1 byte - type of service
- # 2 bytes - total length
- # 2 bytes - fragment identification
- # 3 bits - flags
- # 13 bits - fragment offset
- # 1 byte - TTL
- # 1 byte - Protocol (should be 6)
- # 2 bytes - header checksum
- # 4 bytes - source IP
- # 4 bytes - dest IP
-
- ip_header = struct.unpack(ip_format, raw_data[mac_offset:mac_offset + 20])
- # Assume all packets are TCP
- #if ip_header[6] != 6:
- # print "Not TCP!"
- # continue
-
- src_ip_bin = ip_header[8]
- src_ip = lookup_ip_string(src_ip_bin)
- #src_ip = "%d.%d.%d.%d" % ((src_ip_bin & 0xff000000L) >> 24,
- # (src_ip_bin & 0x00ff0000L) >> 16,
- # (src_ip_bin & 0x0000ff00L) >> 8,
- # src_ip_bin & 0x000000ffL)
- dst_ip_bin = ip_header[9]
- dst_ip = lookup_ip_string(dst_ip_bin)
- #dst_ip = "%d.%d.%d.%d" % ((dst_ip_bin & 0xff000000L) >> 24,
- # (dst_ip_bin & 0x00ff0000L) >> 16,
- # (dst_ip_bin & 0x0000ff00L) >> 8,
- # dst_ip_bin & 0x000000ffL)
-
- ip_size = (ip_header[0] & 0x0f) * 4
-
- # Parse out the TCP packet header
- # 2 bytes - src_prt
- # 2 bytes - dst_port
- # 4 bytes - sequence number
- # 4 bytes - ack number
- # 4 bits - data offset (size in 32 bit words of header
- # 6 bits - reserved
- # 6 bits - control bits
- # 2 bytes - window
- # 2 bytes - checksum
- # 2 bytes - urgent pointer
- tcp_offset = mac_offset + ip_size
- tcp_header = struct.unpack(tcp_format, raw_data[tcp_offset:tcp_offset+20])
- tcp_size = ((tcp_header[4] & 0xf0) >> 4) * 4
- src_port = tcp_header[0]
- dst_port = tcp_header[1]
- # 3 bytes - packet length
- # 1 byte - packet number
- # 1 byte - command
- # <n bytes> - args
- pkt_offset = tcp_offset + tcp_size
- if len(raw_data) == pkt_offset:
- continue
- # Clearly not a mysql packet if it's less than 5 bytes of data
- if len(raw_data) - pkt_offset < 5:
- continue
- src_ip_port = "%s:%d" % (src_ip, src_port)
- dst_ip_port = "%s:%d" % (dst_ip, dst_port)
- if src_port == 3306:
- #
- # We are processing traffic from mysql server -> client
- # This primarily is used to time how long it takes for use
- # to start receiving data to the client from the server.
- #
- mysql_arr = array.array('B', raw_data[pkt_offset])
- result_type = ord(raw_data[pkt_offset])
- # Get or create connection
- conn = self.createConnection(dst_ip_port, pkt_time)
- # Update the status of this connection, including query times on
- # connections
- if conn.updateResponse(pkt_time, result_type):
- # Event: Initial query response
- return "QueryResponse", conn.mLastQuery
- continue
- if dst_port == 3306:
- #
- # Processing a packet from the client to the server
- #
- # HACK! This is an easy place to put this where we can get packet time that only happens once or so per event.
- # Garbage collect connections
- self.cleanupConnection(pkt_time)
- # Pull out packet length from the header
- mysql_arr = array.array('B', raw_data[pkt_offset:pkt_offset+5])
- pkt_len = mysql_arr[0] + (long(mysql_arr[1]) << 8) + (long(mysql_arr[2]) << 16)
- pkt_number = mysql_arr[3]
- # Find the connection associated with this packet
-
- # Get or create connection
- conn = self.createConnection(src_ip_port, pkt_time)
- #if conn.mLastMysqlPacketNumber != (pkt_number - 1):
- # print "Prev:", conn.mLastMysqlPacketNumber, "Cur:", pkt_number
- conn.mLastMysqlPacketNumber = pkt_number
-
- cmd = mysql_arr[4]
- # If we're not a command, do stuff
- if cmd > 0x1c:
- # Unfortunately, we can't trivially tell the difference between
- # various non-command packets
- # Assume that these are all AuthResponses for now.
- conn.updateNonCommand(pkt_time, raw_data[pkt_offset:])
- if "QuerySent" == conn.mCurState:
- return ("QueryStart", conn.mLastQuery)
- continue
- query = None
- if cmd == 1: # COM_QUIT
- # Event: Quitting a connection
- conn.quit(src_ip, src_port, pkt_time)
- # This connection is closing, get rid of it
- self.closeConnection(src_ip_port)
- return ("Quit", conn.mLastQuery)
- elif cmd == 3: # COM_QUERY
- # Event: Starting a query
- conn.queryStart(src_ip, src_port, pkt_time, raw_data, pkt_len, pkt_offset + 5)
- # Only return an QueryStart if we have the whole query
- if "QuerySent" == conn.mCurState:
- return ("QueryStart", conn.mLastQuery)
- else:
- pass
- IP_PORT_RE = re.compile("(\S+):(\d+)")
- EVENT_RE = re.compile("(\S+)\t(\S+):(\d+)\t(\S+)\t(\S+)")
- SECTION_RE = re.compile("\*{38}")
- class LLLogQueryStream:
- "Process a query stream dump to generate a query stream class"
- "Process a raw tcpdump stream (in raw libpcap format)"
- def __init__(self, lineiter):
- self.mLineIter = lineiter
- self.mStartTime = None
- #
- # A list of all outstanding "connections", and what they're doing.
- # This is necessary in order to get script timing and other information.
- #
- self.mConnStatus = {}
- def closeConnection(self, ip_port):
- if ip_port in self.mConnStatus:
- del self.mConnStatus[ip_port]
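- def createConnection(self, client_ip_port, pkt_time):
- # Helper expected by getNextEvent() below; mirrors LLQueryStream.createConnection
- # (without the key list that class keeps for garbage collection).
- if not client_ip_port in self.mConnStatus:
- self.mConnStatus[client_ip_port] = LLConnStatus(client_ip_port, pkt_time)
- return self.mConnStatus[client_ip_port]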
- def getNextEvent(self):
- # Get the next event out of the file
- cur_event = None
- event_time = None
- event_type = None
- ip = None
- port = None
- ip_port = None
- cur_state = 'Metadata'
- for line in self.mLineIter:
- if line == '':
- return (None, None)
- if cur_state == 'Metadata':
- # We're looking for an event. Actually we better find one.
- m = EVENT_RE.match(line)
- if not m:
- #raise "Missing event on line: %s" % line
- continue
- else:
- event_time = float(m.group(1))
- ip = m.group(2)
- port = int(m.group(3))
- ip_port = m.group(2)+":"+m.group(3)
- clean_host = m.group(4)
- event_type = m.group(5)
- query_str = ''
- cur_state = 'Query'
- elif cur_state == 'Query':
- if not SECTION_RE.match(line):
- query_str += line
- else:
- # We're done
- # Generate the event to return
- # Track the connection if we don't know about it yet.
- conn = self.createConnection(ip_port, event_time)
- if event_type == 'QueryStart':
- conn.queryStartProcessed(ip, port, event_time, query_str)
- return ("QueryStart", conn.mLastQuery)
- elif event_type == 'QueryResponse':
- # Update the status of this connection, including query times on
- # connections
- # Hack: Result type defaults to zero
- if conn.updateResponse(event_time, 0):
- # Event: Initial query response
- return ("QueryResponse", conn.mLastQuery)
- else:
- # Skip responses which we don't have the start for
- cur_state = 'Metadata'
- elif event_type == 'Quit':
- # Event: Quitting a connection
- conn.quit(ip, port, event_time)
- # This connection is closing, get rid of it
- self.closeConnection(ip_port)
- return ("Quit", conn.mLastQuery)
- else:
- raise ("Unknown event type %s" % event_type)
- return (None, None)
- def start_dump(host, port):
- # Start up tcpdump pushing data into netcat on the sql server
- interface = "eth0"
-
- # Start up tcpdump pushing data into netcat on the sql server
- SRC_DUMP_CMD = "ssh root@%s '/usr/sbin/tcpdump -p -n -s 0 -w - -i %s dst port 3306 or src port 3306 | nc %s %d'" \
- % (host, interface, socket.getfqdn(), port)
- os.popen2(SRC_DUMP_CMD, "r")
- def remote_mysql_stream(host):
- # Create a server socket, then have tcpdump dump stuff to it.
- serversocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
- bound = False
- port = 9999
- while not bound:
- try:
- serversocket.bind((socket.gethostname(), port))
- bound = True
- except socket.error:
- print port, " already bound, trying again"
- port += 1
- print "Bound port %d" % port
- serversocket.listen(1)
- # Fork off the dumper, start the server on the main connection
- pid = os.fork()
- if not pid:
- # Child process which gets data from the database
- time.sleep(1.0)
- print "Starting dump!"
- start_dump(host, port)
- print "Exiting dump!"
- sys.exit(0)
- print "Starting server"
- (clientsocket, address) = serversocket.accept()
- print "Accepted connection", address
- # Start listening to the data stream
- return clientsocket.makefile("rb")
- #
- # Utility stuff for query cleaner
- #
- # This is a Python port of (part of) the fingerprint() function from
- # the mk-query-digest script in Maatkit, added by Yoz, with various additions/tweaks
- hex_wildcard = r"[0-9a-f]"
- word = hex_wildcard + r"{4}-"
- long_word = hex_wildcard + r"{8}-"
- very_long_word = hex_wildcard + r"{12}"
- UUID_REGEX_STRING = long_word + word + word + word + very_long_word
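- # i.e. the familiar 8-4-4-4-12 hex layout, e.g. "6ba7b810-9dad-11d1-80b4-00c04fd430c8"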
- hex_re = re.compile("^[\da-f]+$",re.I)
- uuid_re = re.compile("^"+UUID_REGEX_STRING+"$",re.I)
- def string_replace(match):
- "Called by string-matching regexp in replacers"
- if uuid_re.match(match.group(1)):
- return "*uuid*"
- return "*string*"
-
- # list of (match,replacement) tuples used by clean_query()
- replacers = [
- # Disabling comment removal because we may put useful inspection info in there
- #(re.compile(r'(?:--|#)[^\'"\r\n]*(?=[\r\n]|\Z)',re.I),""), # one-line comments
- #(re.compile(r"/\*[^!].*?\*/",re.I|re.M|re.S),""), # But not /*!version */
- (re.compile(r"\\\\"),""), # remove backslash pairs that may confuse the next line
- (re.compile(r"\\[\"']"),""), # remove escaped quotes
-
- (re.compile(r'"([^"]*)"',re.I),string_replace), # quoted strings
- (re.compile(r"'([^']*)'",re.I),string_replace), # quoted strings
-
- # this next one may need more work, due to "UPDATE ... SET money = money-23"
- # the next two are significantly different from the maatkit original code
- (re.compile(r"(?<![\w\)\d])(\s*)\-\d+(\.\d+)?",re.I),"*num*"), # negative reals
- (re.compile(r"(?<![\w])\d+(\.\d+)?",re.I),"*num*"), # positive reals
- # mk-query-digest has s/[xb.+-]\?/?/g; as "clean up leftovers" here, whatever that means - I've left it out
-
- (re.compile(r"^\s+",re.I),""), # chop off leading whitespace
- (re.compile(r"\s+$",re.I|re.M|re.S),""), # kill trailing whitespace
-
- # reduce IN and VALUES lists (look for previously-cleaned placeholders)
- (re.compile(r"\b(in|values)(?:[\s,]*\(([\s\,]*\*(num|string|uuid)\*)*[\s,]*\))+",
- re.I|re.X),"\\1(*values*)"), # collapse IN and VALUES lists
-
- # This next one collapses chains of UNIONed functionally-identical queries,
- # but it's only really useful if you're regularly seeing more than 2 queries
- # in a chain. We don't seem to have any like that, so I'm disabling this.
- #(re.compile(r"\b(select\s.*?)(?:(\sunion(?:\sall)?)\s\1)+",re.I),"\\1 -- repeat\\2 --"), # collapse UNION
-
- # remove "OFFSET *num*" when following a LIMIT
- (re.compile(r"\blimit \*num\*(?:, ?\*num\*| offset \*num\*)?",re.I),"LIMIT *num*")
- ]
- prepare_re = re.compile('PREPARE.*', re.IGNORECASE)
- deallocate_re = re.compile('DEALLOCATE\s+PREPARE.*', re.IGNORECASE)
- execute_re = re.compile('EXECUTE.*', re.IGNORECASE)
- mdb_re = re.compile('MDB2_STATEMENT\S+')
- def clean_query(query, num_words):
- "Generalizes a query by removing all unique information"
- # Strip carriage returns
- query = query.replace("\n", " ")
- # Screw it, if it's a prepared statement or an execute, generalize the statement name
- if prepare_re.match(query):
- query = mdb_re.sub('*statement*', query)
- return query
- if execute_re.match(query):
- query = mdb_re.sub('*statement*', query)
- if deallocate_re.match(query):
- query = "DEALLOCATE PREPARE"
- return query
- # Loop through the replacers and perform each one
- for (replacer, subst) in replacers:
- # try block is here because, apparently, string_re may throw an exception
- # TODO: investigate the above
- try:
- query = replacer.sub(subst, query)
- except:
- pass
- # After we do the cleanup, then we get rid of extra whitespace
- words = query.split(None)
- query = " ".join(words)
- return query
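- # Rough hand-traced example of the intended effect (not an exercised test):
- #   clean_query("SELECT * FROM users WHERE id = 123 AND name = 'bob'", 0)
- # should reduce to
- #   "SELECT * FROM users WHERE id = *num* AND name = *string*"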
- def test_clean_query(query):
- "A debug version of the query cleaner which prints steps as it goes"
- # Strip carriage returns
- query = query.replace("\n", " ")
- # Screw it, if it's a prepared statement or an execute, generalize the statement name
- if prepare_re.match(query):
- query = mdb_re.sub('*statement*', query)
- return query
- if execute_re.match(query):
- query = mdb_re.sub('*statement*', query)
- if deallocate_re.match(query):
- query = "DEALLOCATE PREPARE"
- return query
- # Loop through the replacers and perform each one
- for (replacer, subst) in replacers:
- try:
- if replacer.search(query) == None:
- print replacer.pattern," : No match"
- else:
- query = replacer.sub(subst, query)
- print replacer.pattern," : ",query
- except:
- pass
- # After we do the cleanup, then we get rid of extra whitespace
- words = query.split(None)
- query = " ".join(words)
- return query
- #
- # Hostname cache - basically, caches the "linden" host type for a particular IP address
- # or hostname
- #
- sim_re = re.compile(".*sim\d+.*")
- web_re = re.compile("int\.web\d+.*")
- iweb_re = re.compile("int\.iweb\d+.*")
- webds_re = re.compile(".*web-ds\d+.*")
- webster_re = re.compile(".*webster\d+.*")
- bankds_re = re.compile(".*bank-ds\d+.*")
- xmlrpc_re = re.compile(".*xmlrpc\d+.*")
- login_re = re.compile(".*login\d+.*")
- data_re = re.compile(".*data\..*")
- #xmlrpc_re = re.compile("(?:int\.omgiwanna.*)|(?:int\.pony.*)")
- ip_re = re.compile("\d+\.\d+\.\d+\.\d+")
- ll_re = re.compile("(.*)\.lindenlab\.com")
- host_type_cache = {}
- def get_host_type(host):
- "Returns the genericized linden host type from an IP address or hostname"
- # if host in host_type_cache:
- # return host_type_cache[host]
- named_host = str(host)
- if ip_re.match(host):
- # Look up the hostname
- try:
- named_host = str(socket.gethostbyaddr(host)[0])
- except:
- pass
- # Figure out generic host type
- host_type = named_host
- if sim_re.match(named_host):
- host_type = "sim"
- elif login_re.match(named_host):
- host_type = "login"
- elif webster_re.match(named_host):
- host_type = "webster"
- elif bankds_re.match(named_host):
- host_type = "bank-ds"
- elif web_re.match(named_host):
- host_type = "web"
- elif iweb_re.match(named_host):
- host_type = "iweb"
- elif webds_re.match(named_host):
- host_type = "web-ds"
- elif data_re.match(named_host):
- host_type = "data"
- elif xmlrpc_re.match(named_host):
- host_type = "xmlrpc"
- m = ll_re.match(host_type)
- if m:
- host_type = m.group(1)
- host_type_cache[host] = host_type
- return (host_type, named_host)
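- # For example (hypothetical hostname), get_host_type("sim1234.agni.lindenlab.com")
- # is expected to return ("sim", "sim1234.agni.lindenlab.com").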
- def LLLogIter(filenames):
- "An iterator that iterates line by line over a series of files, even if they're compressed."
- for f in filenames:
- curr = open_log_file(f)
- for line in curr:
- yield line
-
- def open_log_file(filename):
- # Open the logfile (even if it's compressed)
- if re.compile(".+\.gz").match(filename):
- # gzipped file, return a gzipped file object
- return gzip.open(filename,"r")
- else:
- return open(filename, "r")