
/mysql_watcher/dblibs/dbutil.py

https://bitbucket.org/lindenlab/apiary/
   1#!/usr/bin/env python
   2
   3#
   4# $LicenseInfo:firstyear=2007&license=mit$
   5# 
   6# Copyright (c) 2007-2010, Linden Research, Inc.
   7# 
   8# Permission is hereby granted, free of charge, to any person obtaining a copy
   9# of this software and associated documentation files (the "Software"), to deal
  10# in the Software without restriction, including without limitation the rights
  11# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  12# copies of the Software, and to permit persons to whom the Software is
  13# furnished to do so, subject to the following conditions:
  14# 
  15# The above copyright notice and this permission notice shall be included in
  16# all copies or substantial portions of the Software.
  17# 
  18# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  19# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  20# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  21# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  22# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  23# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  24# THE SOFTWARE.
  25# $/LicenseInfo$
  26#
  27
  28
  29#
  30# Utility classes that allow us to monitor and keep track of databases
  31#
  32import array
  33import binascii
  34import gzip
  35import math
  36import os
  37import re
  38import socket
  39import string
  40import struct
  41import sys
  42import time
  43
  44from llbase import llsd
  45
  46def asciify(str):
  47    "Lame ASCIIfication of a string to keep various things from barfing"
  48    out_str = ""
  49    for ch in str:
  50        if (ch >= chr(0x9)) and (ch <= '~'):
  51            out_str += ch
  52        else:
  53            out_str += "."
  54    return out_str
  55
  56def all_as_maps(cursor):
  57    """Return all of the cursor with maps for each row instead of sequences"""
  58    all_seq = cursor.fetchall()
  59    ret_all = []
  60    descs = cursor.description
  61    for row in all_seq:
  62        new_row = {}
  63        count = 0
  64        for desc in descs:
  65            new_row[desc[0]] = row[count]
  66            count += 1
  67        ret_all.append(new_row)
  68    return ret_all
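    # Example (hypothetical row): with cursor.description (('id', ...), ('name', ...)) and a
    # fetched row (5, 'bob'), all_as_maps() yields [{'id': 5, 'name': 'bob'}].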
  69
  70#
  71# Cache IP to string lookup to make it faster
  72#
  73ip_table = {}
  74def lookup_ip_string(ip_bin):
  75    if not ip_bin in ip_table:
  76        ip_table[ip_bin] = "%d.%d.%d.%d" % ((ip_bin & 0xff000000L) >> 24,
  77                                            (ip_bin & 0x00ff0000L) >> 16,
  78                                            (ip_bin & 0x0000ff00L) >> 8,
  79                                            ip_bin & 0x000000ffL)
  80    return ip_table[ip_bin]
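    # Example: lookup_ip_string(0x7f000001) returns "127.0.0.1"; repeated lookups for the
    # same packed address hit the ip_table cache instead of reformatting.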
  81
  82def llquery_from_llsd(query_llsd):
  83    # Hack, fill in arbitrary data for info that isn't serialized
  84    query = LLQuery(None, None, query_llsd['query'], 0.0)
  85    query.mData['host_clean'] = query_llsd['host_clean']
  86    query.mData['query_clean'] = query_llsd['query_clean']
  87
  88    # Hack, keeps correctOutliers from trashing the data
  89    #query.mNumQueries = query_llsd['num_queries']
  90    #query.mTotalTime = query_llsd['total_time']
  91    try:
  92        query.mNumQueriesCorrected = query_llsd['num_queries_corrected']
  93        query.mTotalTimeCorrected = query_llsd['total_time_corrected']
  94    except:
  95        # Hack for old output which didn't generate this data
  96        query.mNumQueriesCorrected = query_llsd['num_queries']
  97        query.mTotalTimeCorrected = query_llsd['total_time']
  98        
  99    return query
 100
 101def get_query_tables(query):
 102    "Return the list of tables in a query"
 103    #
 104    # Really dumb method, literally iterates through a bunch of regular expressions to pull this out.
 105    # There are probably better methods out there.
 106    #
 107    
 108    out_tables = []
 109    # Clean up the query
 110    query = query.replace('\n',' ')
 111    query = re.sub('\s+', ' ', query)
 112    
 113    m = LLQuery.sSelectWhereRE.match(query)
 114    if m:
 115        # Split apart by commas
 116        tables = m.group(1).split(',')
 117        for table in tables:
 118            # Take the first part (which is table name)
 119            out_tables.append(string.strip(table.split()[0]))
 120        return out_tables
 121    
 122    m = LLQuery.sSelectRE.match(query)
 123    if m:
 124        out_tables.append(string.strip(m.group(1)))
 125        return out_tables
 126        
 127    m = LLQuery.sUpdateRE.match(query)
 128    if m:
 129        # Split apart by commas
 130        tables = m.group(1).split(',')
 131        for table in tables:
 132            # Take the first part (which is table name)
 133            out_tables.append(string.strip(table.split()[0]))
 134        return out_tables
 135
 136    m = LLQuery.sReplaceRE.match(query)
 137    if m:
 138        out_tables.append(string.strip(m.group(1)))
 139        return out_tables
 140    
 141    m = LLQuery.sInsertRE.match(query)
 142    if m:
 143        out_tables.append(string.strip(m.group(1)))
 144        return out_tables
 145
 146    m = LLQuery.sDeleteRE.match(query)
 147    if m:
 148        out_tables.append(string.strip(m.group(1)))
 149        return out_tables
 150    return out_tables
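    # Example (hypothetical query): get_query_tables("SELECT name FROM users WHERE id = 1")
    # returns ['users']; tables listed with commas after FROM come back as one entry per table.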
 151
 152
 153MIN_BIN=-15
 154MAX_BIN=10
 155class LLQuery:
 156    "Represents all of the data associated with a query"
 157    fromLLSDStats = staticmethod(llquery_from_llsd)
 158    def __init__(self, host, port, query, start_time):
 159        # Store information which will be serialized for metadata in a map
 160        self.mData = {}
 161        self.mData['host'] = host
 162        self.mData['port'] = port
 163        self.mData['query'] = query
 164
 165        # Metadata
 166        self.mData['host_clean'] = None
 167        self.mData['host_full'] = None
 168        self.mData['query_clean'] = None
 169        self.mData['tables'] = []
 170
 171        #
 172        # Stats information
 173        #
 174        self.mNumQueries = 0
 175        self.mTotalTime = 0.0
 176        self.mOutQueries = 0
 177        self.mTotalTimeCorrected = 0.0 # Corrected to remove outliers
 178        self.mNumQueriesCorrected = 0 # Corrected to remove outliers
 179
 180        # LLQueryStatBins for the query time histogram, as well as corrected time
 181        # Query times are collected into bins based on power of 2 execution times (in seconds).
 182        # Each bin collects the number of queries and total execution time. See LLQueryStatBin
 183        # for more details
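            # e.g. a 3.0 second query gets bin int(math.log(3.0, 2)) == 1, i.e. the [2, 4) second
            # bucket; bins are clamped to the [MIN_BIN, MAX_BIN] range in queryResponse() below.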
 184        self.mBins = {} # Bins for histogram
 185
 186        # This stuff doesn't usually get serialized
 187        self.mQueryLen = len(query)
 188        self.mStartTime = start_time
 189        self.mResponseTime = start_time
 190
 191    def __hash__(self):
 192        return (self.mData['host_clean'] + ":" + self.mData['query_clean']).__hash__()
 193
 194    def __eq__(self, other):
 195        # Note, this matches on clean, not strictly correct
 196        if ((self.mData['query_clean'] == other.mData['query_clean']) and
 197            (self.mData['host_clean'] == other.mData['host_clean'])):
 198            return True
 199        return False
 200
 201    def getKey(self):
 202        # The string key is just the clean host and query, concatenated
 203        return self.mData['host_clean'] + ":" + self.mData['query_clean']
 204        
 205    def clean(self):
 206        "Generate the clean query so it can be used for statistics"
 207        if not self.mData['host_clean']:
 208            (self.mData['host_clean'], self.mData['host_full']) = get_host_type(self.mData['host'])
 209            self.mData['query_clean'] = clean_query(self.mData['query'], 0)
 210
 211    def getAvgTimeCorrected(self):
 212        "Average time per query, corrected for outliers"
 213        return self.mTotalTimeCorrected/self.mNumQueriesCorrected
 214
 215    def queryStart(self):
 216        "When collecting query stats, use this when the query is received"
 217        self.mNumQueries += 1
 218        self.mOutQueries += 1
 219
 220    def queryResponse(self, elapsed):
 221        "When collecting stats, use this when the response is received"
 222        self.mTotalTime += elapsed
 223        self.mOutQueries -=1
 224
 225        # Determine which stat bin this query is in
 226        bin = MIN_BIN
 227        if elapsed:
 228            bin = int(math.log(elapsed,2))
 229        bin = max(MIN_BIN, bin)
 230        bin = min(MAX_BIN, bin)
 231        if bin not in self.mBins:
 232            self.mBins[bin] = LLQueryStatBin(bin)
 233        self.mBins[bin].accumulate(elapsed)
 234
 235    def correctOutliers(self):
 236        "Find outliers bins and calculate corrected results"
 237        # Outlier bins have query counts below 1% (two orders of magnitude) of the total count for that query
 238        if not self.mNumQueries:
 239            # FIXME: This is a hack because we don't save this information in the query count dump
 240            return
 241        min_queries = self.mNumQueries/100
 242        self.mTotalTimeCorrected = 0.0
 243        self.mNumQueriesCorrected = 0
 244        for i in self.mBins.keys():
 245            if self.mBins[i].mNumQueries < min_queries:
 246                # Outlier, flag as such.
 247                self.mBins[i].mOutlier = True
 248            else:
 249                self.mTotalTimeCorrected += self.mBins[i].mTotalTime
 250                self.mNumQueriesCorrected += self.mBins[i].mNumQueries
 251        if self.mNumQueriesCorrected == 0:
 252            #HACK: Deal with divide by zero
 253            self.mNumQueriesCorrected = 1
 254
 255    # Miscellaneous regular expressions to analyze the query type
 256    sReadRE = re.compile("(SELECT.*)|(USE.*)", re.IGNORECASE)
 257    sSelectWhereRE = re.compile("\(?\s*?SELECT.+?FROM\s+\(?(.*?)\)?\s+WHERE.*", re.IGNORECASE)
 258    sSelectRE = re.compile("\(?\s*?SELECT.+?FROM\s+(.+)(?:\s+LIMIT.*|.*)", re.IGNORECASE)
 259    sUpdateRE = re.compile("UPDATE\s+(.+?)\s+SET.*", re.IGNORECASE)
 260    sReplaceRE = re.compile("REPLACE INTO\s+(.+?)(?:\s*\(|\s+SET).*", re.IGNORECASE)
 261    sInsertRE = re.compile("INSERT.+?INTO\s+(.+?)(?:\s*\(|\s+SET).*", re.IGNORECASE)
 262    sDeleteRE = re.compile("DELETE.+?FROM\s+(.+?)\s+WHERE.*", re.IGNORECASE)
 263    def analyze(self):
 264        "Does some query analysis on the query"
 265        query = self.mData['query_clean']
 266        self.mData['tables'] = get_query_tables(query)
 267        if 'type' in self.mData:
 268            # Already analyzed
 269            return
 270        if LLQuery.sReadRE.match(query):
 271            self.mData['type'] = 'read'
 272        else:
 273            self.mData['type'] = 'write'
 274
 275
 276    def dumpLine(self, elapsed, query_len = 0):
 277        "Dump a semi-human-readable stats line for reporting"
 278        bin_str = ''
 279        for i in range(MIN_BIN,MAX_BIN+1):
 280            if i in self.mBins:
 281                if self.mBins[i].mOutlier:
 282                    bin_str += '*'
 283                else:
 284                    bin_str += str(int(math.log10(self.mBins[i].mNumQueries)))
 285            else:
 286                bin_str += '.'
 287        if not query_len:
 288            query_len = 4096
 289        num_queries = self.mNumQueriesCorrected
 290        if not num_queries:
 291            num_queries = 1
 292        return ("%s\t%5d\t%6.2f\t%6.2f\t%1.4f\t%s\t" % (bin_str, num_queries,
 293                                                       num_queries/elapsed, self.mTotalTimeCorrected,
 294                                                       self.mTotalTimeCorrected/num_queries, self.mData['host_clean'])) \
 295                                                       + self.mData['query_clean'][0:query_len]
 296
 297    def as_map(self):
 298        "Make an LLSD map version of data that can be used for merging"
 299        self.analyze()
 300        self.mData['num_queries'] = self.mNumQueries
 301        self.mData['total_time'] = self.mTotalTime
 302        self.mData['num_queries_corrected'] = self.mNumQueriesCorrected
 303        self.mData['total_time_corrected'] = self.mTotalTimeCorrected
 304        return self.mData
 305
 306class LLConnStatus:
 307    "Keeps track of the status of a connection talking to mysql"
 308    def __init__(self, ip_port, start_time):
 309        self.mLastMysqlPacketNumber = 0
 310        self.mNumPackets = 0
 311        self.mIPPort = ip_port
 312        self.mStartTime = start_time
 313        self.mLastUpdate = start_time
 314        self.mCurState = ""
 315        self.mLastQuery = None
 316        self.mNumQueries = 0
 317
 318    def quit(self, src_ip, src_port, pkt_time):
 319        query = LLQuery(src_ip, src_port, "Quit", pkt_time)
 320        query.clean()
 321        self.mLastUpdate = pkt_time
 322        self.mLastQuery = query
 323        self.mNumPackets += 1
 324
 325    def queryStart(self, src_ip, src_port, pkt_time, raw, pkt_len, offset):
 326        query_len = pkt_len - 1
 327        query = LLQuery(src_ip, src_port, raw[offset:offset + (pkt_len - 1)], pkt_time)
 328        self.mLastUpdate = pkt_time
 329        # Packet length includes the command, offset into raw doesn't
 330        if query_len > (len(raw) - offset):
 331            query.mQueryLen = query_len
 332            self.mCurState = "SendingQuery"
 333        else:
 334            self.mCurState = "QuerySent"
 335            query.clean()
 336        self.mNumQueries += 1
 337        self.mLastQuery = query
 338        self.mNumPackets += 1
 339
 340    def queryStartProcessed(self, src_ip, src_port, pkt_time, query_str):
 341        query = LLQuery(src_ip, src_port, query_str, pkt_time)
 342        query.clean()
 343        self.mLastUpdate = pkt_time
 344        self.mCurState = "QuerySent"
 345        self.mNumQueries += 1
 346        self.mLastQuery = query
 347        self.mNumPackets += 1
 348
 349    def updateNonCommand(self, pkt_time, raw):
 350        # Clean up an existing query if you get a non-command.
 351        self.mNumPackets += 1
 352        self.mLastUpdate = pkt_time
 353        if self.mLastQuery:
 354            if self.mCurState == "SendingQuery":
 355                # We're continuing a query
 356                # We won't generate a new clean version, because it'll $!@# up all the sorting.
 357                self.mLastQuery.mData['query'] += raw
 358                if len(self.mLastQuery.mData['query']) == self.mLastQuery.mQueryLen:
 359                    self.mCurState = "QuerySent"
 360                    self.mLastQuery.clean()
 361                return
 362            else:
 363                #
 364                # A non-command that's continuing a query. Not sure why this is happening,
 365                # but clear the last query to avoid generating inadvertent long query results.
 366                #
 367                self.mLastQuery = None
 368        # Default to setting state to "NonCommand"
 369        self.mCurState = "NonCommand"
 370
 371    def updateResponse(self, pkt_time, result_type):
 372        # If we've got a query running, accumulate the elapsed time
 373        start_query_response = False
 374        if self.mCurState == "QuerySent":
 375            lq = self.mLastQuery
 376            if lq:
 377                if lq.mStartTime == 0.0:
 378                    lq.mStartTime = pkt_time
 379                lq.mResponseTime = pkt_time
 380                start_query_response = True
 381
 382        self.mLastUpdate = pkt_time
 383        if result_type == 0:
 384            self.mCurState = "Result:RecvOK"
 385        elif result_type == 0xff:
 386            self.mCurState = "Result:Error"
 387        elif result_type == 0xfe:
 388            self.mCurState = "Result:EOF"
 389        elif result_type == 0x01:
 390            self.mCurState = "Result:Header"
 391        else:
 392            self.mCurState = "Result:Data"
 393        return start_query_response
 394
 395    def dump(self):
 396        if self.mLastQuery:
 397            print "%s: NumQ: %d State:%s\n\tLast: %s" % (self.mIPPort, self.mNumQueries, self.mCurState,
 398                                                         self.mLastQuery.mData['query_clean'][0:40])
 399        else:
 400            print "%s: NumQ: %d State:%s\n\tLast: None" % (self.mIPPort, self.mNumQueries, self.mCurState)
 401
 402class LLQueryStatBin:
 403    "Keeps track of statistics for one query bin"
 404    def __init__(self, power):
 405        self.mMinTime = pow(2, power)
 406        self.mMaxTime = pow(2, power+1)
 407        self.mTotalTime = 0
 408        self.mNumQueries = 0
 409        self.mOutlier = False
 410    def accumulate(self, elapsed):
 411        self.mTotalTime += elapsed
 412        self.mNumQueries += 1
 413
 414def dump_query_stat_header():
 415    return "LogHistogram (-15:10)     \tCount\tQPS\tTotal\tAvg\tHost\tQuery"
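    # In the LogHistogram column each character is one bin (exponent -15 through 10, i.e.
    # power-of-two query times in seconds): '.' means no queries landed in the bin, '*' marks
    # an outlier bin, and a digit is int(log10(count)) for that bin (see LLQuery.dumpLine above).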
 416
 417
 418class LLQueryStatMap:
 419    def __init__(self, description, start_time):
 420        self.mDescription = description
 421        self.mQueryMap = {}
 422        self.mStartTime = start_time
 423        self.mFinalTime = 0
 424        self.mLastTime = self.mStartTime
 425        self.mQueryStartCount = 0
 426        self.mQueryResponseCount = 0
 427
 428    def load(self, fn):
 429        "Load dumped query stats from an LLSD file"
 430        # Read in metadata
 431        in_file = open(fn)
 432        in_string = in_file.read()
 433        in_file.close()
 434        in_llsd = llsd.LLSD.parse(in_string)
 435        info = in_llsd[0]
 436        query_list = in_llsd[1]
 437        self.mDescription = info['description']
 438        self.mStartTime = info['start_time']
 439        self.mLastTime = info['last_time']
 440        self.mFinalTime = info['last_time']
 441        self.mQueryStartCount = info['query_start_count']
 442        self.mQueryResponseCount = info['query_response_count']
 443        # Iterate through all the queries, and populate the query map.
 444        for query_row in query_list:
 445            query = LLQuery.fromLLSDStats(query_row)
 446            self.mQueryMap[query.getKey()] = query
 447
 448    def analyze(self):
 449        for query in self.mQueryMap.values():
 450            query.analyze()
 451
 452    def queryStart(self, query):
 453        if not query in self.mQueryMap:
 454            #query.analyze()
 455            self.mQueryMap[query] = query
 456        self.mQueryMap[query].queryStart()
 457        # Update elapsed time for this map
 458        self.mLastTime = query.mStartTime
 459        if self.mLastTime < self.mStartTime:
 460            self.mStartTime = self.mLastTime
 461        if self.mLastTime > self.mFinalTime:
 462            self.mFinalTime = self.mLastTime
 463        self.mQueryStartCount += 1
 464        
 465    def queryResponse(self, query):
 466        if not query in self.mQueryMap:
 467            self.queryStart(query)
 468        elapsed = query.mResponseTime - query.mStartTime
 469        self.mQueryMap[query].queryResponse(elapsed)
 470        self.mLastTime = query.mResponseTime
 471        if self.mLastTime > self.mFinalTime:
 472            self.mFinalTime = self.mLastTime
 473        self.mQueryResponseCount += 1
 474
 475    def getElapsedTime(self):
 476        return self.mFinalTime - self.mStartTime
 477
 478    def getQPS(self):
 479        return self.mQueryStartCount / self.getElapsedTime()
 480
 481    def correctOutliers(self):
 482        for query in self.mQueryMap.values():
 483            query.correctOutliers()
 484
 485    def getSortedKeys(self, sort_by = "total_time"):
 486        "Gets a list of keys sorted by sort type"
 487        self.correctOutliers()
 488        
 489        items = self.mQueryMap.items()
 490        backitems = None
 491
 492        if sort_by == "total_time":
 493            backitems = [[v[1].mTotalTimeCorrected, v[0]] for v in items]
 494        elif sort_by == "count":
 495            backitems = [[v[1].mNumQueriesCorrected, v[0]] for v in items]
 496        elif sort_by == "avg_time":
 497            backitems = [[v[1].getAvgTimeCorrected(), v[0]] for v in items]
 498        else:
 499            # Fallback, sort by total time
 500            backitems = [[v[1].mTotalTimeCorrected, v[0]] for v in items]
 501
 502        backitems.sort()
 503        backitems.reverse()
 504
 505        # Get the keys out of the items
 506        sorted = []
 507        for pair in backitems:
 508            sorted.append(pair[1])
 509        return sorted
 510
 511    def getSortedStats(self, sort_by = "total_time", num_stats = 0):
 512        "Gets a list of the top queries according to sort type"
 513        sorted_keys = self.getSortedKeys(sort_by)
 514
 515        if num_stats == 0:
 516            l = len(sorted_keys)
 517        else:
 518            l = min(num_stats, len(sorted_keys))
 519
 520        stats = []
 521        for i in range(0, l):
 522            stats.append(self.mQueryMap[sorted_keys[i]])
 523        return stats
 524
 525    def dumpStatus(self, sort_type = "total_time", elapsed = None):
 526        # Dump status according to total time
 527        if not elapsed:
 528            elapsed = self.getElapsedTime()
 529
 530        sorted_stats = self.getSortedStats(sort_type)
 531        for query in sorted_stats:
 532            print query.dumpLine(elapsed, 60)
 533
 534    def dumpLLSD(self, filename):
 535        # Analyze queries to generate metadata
 536        self.analyze()
 537        # Dump an LLSD document representing the entire object
 538        out = []
 539
 540        # First, dump all the metadata into the first block
 541        info_map = {}
 542        info_map['description'] = self.mDescription
 543        info_map['start_time'] = self.mStartTime
 544        info_map['last_time'] = self.mLastTime
 545        info_map['query_start_count'] = self.mQueryStartCount
 546        info_map['query_response_count'] = self.mQueryResponseCount
 547        out.append(info_map)
 548
 549        # Dump all of the query info into the second block
 550        sorted_stats = self.getSortedStats("total_time")
 551        query_list = []
 552        for query in sorted_stats:
 553            query_list.append(query.as_map())
 554        out.append(query_list)
 555        f = open(filename, "w")
 556        f.write(str(llsd.LLSD(out)))
 557        f.close()
 558
 559    def dumpTiming(self, filename):
 560        cur_time = time.time()
 561        f = open(filename, "w")
 562        f.write(dump_query_stat_header() + "\n")
 563        # Sort the queries
 564        sorted_stats = self.getSortedStats("total_time")
 565        for query in sorted_stats:
 566            f.write(query.dumpLine(cur_time - self.mStartTime))
 567            f.write("\n")
 568        f.close()
 569
 570    def dumpCountsLLSD(self, filename):
 571        "Dump the query statistics as an LLSD doc, for later merging with the query_info doc"
 572
 573        out = []
 574        # Put the metadata into a map
 575        info_map = {}
 576        info_map['description'] = self.mDescription
 577        info_map['start_time'] = self.mStartTime
 578        info_map['last_time'] = self.mLastTime
 579        info_map['query_start_count'] = self.mQueryStartCount
 580        info_map['query_response_count'] = self.mQueryResponseCount
 581        out.append(info_map)
 582
 583        sorted_stats = self.getSortedStats("total_time")
 584        query_list = []
 585        for query in sorted_stats:
 586            query_row = {}
 587            # We only want to dump identifying info and stats, not metadata
 588            query_row['host_clean'] = query.mData['host_clean']
 589            # Convert the queries to utf-8 to make sure they don't break the XML
 590            try:
 591                u = unicode(query.mData['query_clean'])
 592                query_row['query_clean'] = u.encode('utf-8')
 593            except:
 594                query_row['query_clean'] = 'NON-UTF8'
 595            try:
 596                u = unicode(query.mData['query'])
 597                query_row['query'] = u.encode('utf-8')
 598            except:
 599                query_row['query'] = 'NON-UTF8'
 600            query_row['count'] = query.mNumQueriesCorrected
 601            query_row['total_time'] = query.mTotalTimeCorrected
 602            query_row['avg_time'] = query.getAvgTimeCorrected()
 603            query_list.append(query_row)
 604
 605        out.append(query_list)
 606        f = open(filename, "w")
 607        f.write(str(llsd.LLSD(out)))
 608        f.close()
 609
 610
 611class LLBinnedQueryStats:
 612    "Keeps track of a fixed number of N minute bins of query stats"
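        # Bins are keyed by local-time strings: "%Y-%m-%d-%H" for hour bins and
        # "%Y-%m-%d-%H-%M" for minute bins (see queryStart/queryResponse below).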
 613    def __init__(self):
 614        self.mHourBins = {} # This will be keyed by unixtime seconds, eventually
 615        self.mMinuteBins = {}
 616        self.mLastUpdateHour = 0
 617        self.mLastUpdateMinute = 0
 618
 619    def dumpTiming(self, path):
 620        # Dump hour bins
 621        for (key, value) in self.mHourBins.items():
 622            value.dumpTiming("%s/hour-%s-query_timing.txt" % (path, key))
 623        # Dump minute bins
 624        for (key, value) in self.mMinuteBins.items():
 625            value.dumpTiming("%s/minute-%s-query_timing.txt" % (path, key))
 626
 627    def dumpCountsLLSD(self, path):
 628        # Dump hour bins
 629        for (key, value) in self.mHourBins.items():
 630            value.dumpCountsLLSD("%s/hour-%s-query_counts.llsd" % (path, key))
 631        # Dump minute bins
 632        for (key, value) in self.mMinuteBins.items():
 633            value.dumpCountsLLSD("%s/minute-%s-query_counts.llsd" % (path, key))
 634
 635    def dumpLLSD(self, path):
 636        # Dump hour bins
 637        for (key, value) in self.mHourBins.items():
 638            value.dumpLLSD("%s/hour-%s-query_dump.llsd" % (path, key))
 639        # Dump minute bins
 640        for (key, value) in self.mMinuteBins.items():
 641            value.dumpLLSD("%s/minute-%s-query_dump.llsd" % (path, key))
 642
 643    def flushOldBins(self, time_secs):
 644        for minute_bin_str in self.mMinuteBins.keys():
 645            bin_secs = time.mktime(time.strptime(minute_bin_str, "%Y-%m-%d-%H-%M"))
 646            if (time_secs - bin_secs) > 3*3600:
 647                del self.mMinuteBins[minute_bin_str]
 648
 649    def queryStart(self, query):
 650        "Update associated bin for the time specified, creating if necessary"
 651        # Hour and minute bins
 652        t = time.localtime(query.mStartTime)
 653        hour_bin_str = time.strftime("%Y-%m-%d-%H", t)
 654        minute_bin_str = time.strftime("%Y-%m-%d-%H-%M", t)
 655        hour = t[3]
 656        minute = t[4]
 657        # FIXME: These start times are a bit inaccurate, but should be fine under heavy query load.
 658        if not hour_bin_str in self.mHourBins:
 659            self.mHourBins[hour_bin_str] = LLQueryStatMap(hour_bin_str, query.mStartTime)
 660        if not minute_bin_str in self.mMinuteBins:
 661            self.mMinuteBins[minute_bin_str] = LLQueryStatMap(minute_bin_str, query.mStartTime)
 662
 663        self.mHourBins[hour_bin_str].queryStart(query)
 664        self.mMinuteBins[minute_bin_str].queryStart(query)
 665
 666        if hour != self.mLastUpdateHour:
 667            self.mLastUpdateHour = hour
 668            # If the hour changes, dump and clean out old bins
 669            self.flushOldBins(query.mStartTime)
 670
 671    def queryResponse(self, query):
 672        "Update associated bin for the time specified, creating if necessary"
 673        # Hour and minute bins
 674        t = time.localtime(query.mStartTime)
 675        hour_bin_str = time.strftime("%Y-%m-%d-%H", t)
 676        minute_bin_str = time.strftime("%Y-%m-%d-%H-%M", t)
 677        hour = t[3]
 678        minute = t[4]
 679        # FIXME: These start times are a bit inaccurate, but should be fine under heavy query load.
 680        if not hour_bin_str in self.mHourBins:
 681            self.mHourBins[hour_bin_str] = LLQueryStatMap(hour_bin_str, query.mStartTime)
 682        if not minute_bin_str in self.mMinuteBins:
 683            self.mMinuteBins[minute_bin_str] = LLQueryStatMap(minute_bin_str, query.mStartTime)
 684            
 685        self.mHourBins[hour_bin_str].queryResponse(query)
 686        self.mMinuteBins[minute_bin_str].queryResponse(query)
 687        
 688
 689# MySQL protocol sniffer, using tcpdump, ncap packet parsing and mysql internals
 690# http://forge.mysql.com/wiki/MySQL_Internals_ClientServer_Protocol
 691class LLQueryStream:
 692    "Process a raw tcpdump stream (in raw libpcap format)"
 693    def __init__(self, in_file):
 694        self.mInFile = in_file
 695        self.mStartTime = time.time()
 696
 697        #
 698        # A list of all outstanding "connections", and what they're doing.
 699        # This is necessary in order to get script timing and other information.
 700        #
 701        self.mConnStatus = {}
 702        self.mConnKeys = []
 703        self.mConnCleanupIndex = 0
 704
 705        #
 706        # Parse/skip past the libpcap global header
 707        #
 708        
 709        #guint32 magic_number;   /* magic number */
 710        #guint16 version_major;  /* major version number */
 711        #guint16 version_minor;  /* minor version number */
 712        #gint32  thiszone;       /* GMT to local correction */
 713        #guint32 sigfigs;        /* accuracy of timestamps */
 714        #guint32 snaplen;        /* max length of captured packets, in octets */
 715        #guint32 network;        /* data link type */
 716
 717        # Skip past the libpcap global header
 718        format = 'IHHiIII'
 719        size = struct.calcsize(format)
 720        header_bin = self.mInFile.read(size)
 721        res = struct.unpack(format, header_bin)
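            # NOTE (assumption): the capture is taken to be in native byte order; res[0] is the
            # libpcap magic number (0xa1b2c3d4 when the byte order matches) and could be checked
            # here, but this code just skips past the global header.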
 722
 723    def createConnection(self, client_ip_port, pkt_time):
 724        # Track the connection, create a new one or return existing
 725        if not client_ip_port in self.mConnStatus:
 726            self.mConnStatus[client_ip_port] = LLConnStatus(client_ip_port, pkt_time)
 727            # Track a new key that we need to garbage collect
 728            self.mConnKeys.append(client_ip_port)
 729        conn = self.mConnStatus[client_ip_port]
 730        return conn
 731    
 732    def closeConnection(self, ip_port):
 733        if ip_port in self.mConnStatus:
 734            del self.mConnStatus[ip_port]
 735
 736    def cleanupConnection(self,cur_time):
 737        # Cleanup some number of stale connections.
 738        CONNECTION_EXPIRY=900.0
 739        if self.mConnCleanupIndex >= len(self.mConnKeys):
 740            self.mConnCleanupIndex = 0
 741            # Skip if no keys
 742            if len(self.mConnKeys) == 0:
 743                return
 744        key = self.mConnKeys[self.mConnCleanupIndex]
 745        if key in self.mConnStatus:
 746            # Clean up if it's too old
 747            if self.mConnStatus[key].mLastUpdate < (cur_time - CONNECTION_EXPIRY):
 748                del self.mConnStatus[key]
 749                #print "Cleaning up old key:", key
 750                #print "num conns:", len(self.mConnStatus)
 751                #print "num keys", len(self.mConnKeys)
 752        else:
 753            # Clean up if the connection is already removed
 754            del self.mConnKeys[self.mConnCleanupIndex]
 755        self.mConnCleanupIndex += 1
 756
 757    def getNextEvent(self):
 758        # Get the next event out of the packet stream
 759
 760        td_format = 'IIII'
 761        ip_format = '!BBHHHBBHII'
 762        tcp_format = '!HHIIBBHHH'
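            # td_format:  libpcap per-record header (ts_sec, ts_usec, incl_len, orig_len), native order
            # ip_format:  IPv4 header fields, network byte order (field breakdown in the comments below)
            # tcp_format: TCP header fields through the urgent pointer, network byte order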
 763        while 1:
 764            #
 765            # Parse out an individual packet from the tcpdump stream
 766            #
 767            # Match the packet header
 768
 769            # Pull a record (packet) off of the wire
 770
 771            # Packet header
 772            # guint32 ts_sec;         /* timestamp seconds */
 773            # guint32 ts_usec;        /* timestamp microseconds */
 774            # guint32 incl_len;       /* number of octets of packet saved in file */
 775            # guint32 orig_len;       /* actual length of packet */
 776            ph_bin = self.mInFile.read(16)
 777            res = struct.unpack(td_format, ph_bin)
 778            ts_sec = res[0]
 779            ts_usec = res[1]
 780            pkt_time = ts_sec + (ts_usec/1000000.0)
 781            incl_len = res[2]
 782            orig_len = res[3]
 783
 784            # Packet data (incl_len bytes)
 785            raw_data = self.mInFile.read(incl_len)
 786
 787            # Parse out the MAC header
 788            # Don't bother, we don't care - 14 byte header
 789            mac_offset = 14
 790
 791            # Parse out the IP header (min 20 bytes)
 792            # 4 bits - version
 793            # 4 bits - header length in 32 bit words
 794            # 1 byte - type of service
 795            # 2 bytes - total length
 796            # 2 bytes - fragment identification
 797            # 3 bits - flags
 798            # 13 bits - fragment offset
 799            # 1 byte - TTL
 800            # 1 byte - Protocol (should be 6)
 801            # 2 bytes - header checksum
 802            # 4 bytes - source IP
 803            # 4 bytes - dest IP
 804            
 805            ip_header = struct.unpack(ip_format, raw_data[mac_offset:mac_offset + 20])
 806
 807            # Assume all packets are TCP
 808            #if ip_header[6] != 6:
 809            #    print "Not TCP!"
 810            #    continue
 811            
 812            src_ip_bin = ip_header[8]
 813            src_ip = lookup_ip_string(src_ip_bin)
 814            #src_ip = "%d.%d.%d.%d" % ((src_ip_bin & 0xff000000L) >> 24,
 815            #                          (src_ip_bin & 0x00ff0000L) >> 16,
 816            #                          (src_ip_bin & 0x0000ff00L) >> 8,
 817            #                          src_ip_bin & 0x000000ffL)
 818            dst_ip_bin = ip_header[9]
 819            dst_ip = lookup_ip_string(dst_ip_bin)
 820            #dst_ip = "%d.%d.%d.%d" % ((dst_ip_bin & 0xff000000L) >> 24,
 821            #                          (dst_ip_bin & 0x00ff0000L) >> 16,
 822            #                          (dst_ip_bin & 0x0000ff00L) >> 8,
 823            #                          dst_ip_bin & 0x000000ffL)
 824            
 825            ip_size = (ip_header[0] & 0x0f) * 4
 826            
 827
 828            # Parse out the TCP packet header
 829            # 2 bytes - src_prt
 830            # 2 bytes - dst_port
 831            # 4 bytes - sequence number
 832            # 4 bytes - ack number
 833            # 4 bits - data offset (size of header in 32 bit words)
 834            # 6 bits - reserved
 835            # 6 bits - control bits
 836            # 2 bytes - window
 837            # 2 bytes - checksum
 838            # 2 bytes - urgent pointer
 839
 840            tcp_offset = mac_offset + ip_size
 841            tcp_header = struct.unpack(tcp_format, raw_data[tcp_offset:tcp_offset+20])
 842            tcp_size = ((tcp_header[4] & 0xf0) >> 4) * 4
 843
 844            src_port = tcp_header[0]
 845            dst_port = tcp_header[1]
 846
 847            # 3 bytes - packet length
 848            # 1 byte - packet number
 849            # 1 byte - command
 850            # <n bytes> - args
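                # e.g. (hypothetical) header bytes 2c 00 00 00 03 ... decode below as
                # pkt_len = 0x00002c = 44 (3 bytes, little-endian), packet number 0, command 3 (COM_QUERY)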
 851            pkt_offset = tcp_offset + tcp_size
 852
 853            if len(raw_data) == pkt_offset:
 854                continue
 855
 856            # Clearly not a mysql packet if it's less than 5 bytes of data
 857            if len(raw_data) - pkt_offset < 5:
 858                continue
 859
 860            src_ip_port = "%s:%d" % (src_ip, src_port)
 861            dst_ip_port = "%s:%d" % (dst_ip, dst_port)
 862
 863            if src_port == 3306:
 864                #
 865                # We are processing traffic from mysql server -> client
 866                # This is primarily used to time how long it takes for us
 867                # to start receiving data at the client from the server.
 868                #
 869                mysql_arr = array.array('B', raw_data[pkt_offset])
 870                result_type = ord(raw_data[pkt_offset])
 871
 872                # Get or create connection
 873                conn = self.createConnection(dst_ip_port, pkt_time)
 874
 875                # Update the status of this connection, including query times on
 876                # connections
 877                if conn.updateResponse(pkt_time, result_type):
 878                    # Event: Initial query response
 879                    return "QueryResponse", conn.mLastQuery
 880                continue
 881            if dst_port == 3306:
 882                #
 883                # Processing a packet from the client to the server
 884                #
 885
 886                # HACK! This is a convenient place to garbage collect stale connections,
 887                # since we have the packet time here and it only runs about once per event.
 888                self.cleanupConnection(pkt_time)
 889
 890                # Pull out packet length from the header
 891                mysql_arr = array.array('B', raw_data[pkt_offset:pkt_offset+5])
 892                pkt_len = mysql_arr[0] + (long(mysql_arr[1]) << 8) + (long(mysql_arr[2]) << 16)
 893
 894                pkt_number = mysql_arr[3]
 895
 896                # Find the connection associated with this packet
 897                
 898                # Get or create connection
 899                conn = self.createConnection(src_ip_port, pkt_time)
 900
 901                #if conn.mLastMysqlPacketNumber != (pkt_number - 1):
 902                #    print "Prev:", conn.mLastMysqlPacketNumber, "Cur:", pkt_number
 903                conn.mLastMysqlPacketNumber = pkt_number
 904                
 905                cmd = mysql_arr[4]
 906                # If we're not a command, do stuff
 907                if cmd > 0x1c:
 908                    # Unfortunately, we can't trivially tell the difference between
 909                    # various non-command packets
 910                    # Assume that these are all AuthResponses for now.
 911
 912                    conn.updateNonCommand(pkt_time, raw_data[pkt_offset:])
 913                    if "QuerySent" == conn.mCurState:
 914                        return ("QueryStart", conn.mLastQuery)
 915                    continue
 916
 917                query = None
 918
 919                if cmd == 1:
 920                    # Event: Quitting a connection
 921                    conn.quit(src_ip, src_port, pkt_time)
 922                    # This connection is closing, get rid of it
 923                    self.closeConnection(src_ip_port)
 924                    return ("Quit", conn.mLastQuery)
 925                elif cmd == 3:
 926                    # Event: Starting a query
 927                    conn.queryStart(src_ip, src_port, pkt_time, raw_data, pkt_len, pkt_offset + 5)
 928
 929                    # Only return an QueryStart if we have the whole query
 930                    if "QuerySent" == conn.mCurState:
 931                        return ("QueryStart", conn.mLastQuery)
 932                else:
 933                    pass
 934
 935IP_PORT_RE = re.compile("(\S+):(\d+)")
 936EVENT_RE = re.compile("(\S+)\t(\S+):(\d+)\t(\S+)\t(\S+)")
 937SECTION_RE = re.compile("\*{38}")
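    # EVENT_RE matches event header lines of the form "<time>\t<ip>:<port>\t<host_clean>\t<event_type>";
    # SECTION_RE matches the row of 38 asterisks separating events (see LLLogQueryStream.getNextEvent).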
 938
 939
 940class LLLogQueryStream:
 941    "Process a query stream dump to generate a query stream class"
 943    def __init__(self, lineiter):
 944        self.mLineIter = lineiter
 945        self.mStartTime = None
 946
 947        #
 948        # A list of all outstanding "connections", and what they're doing.
 949        # This is necessary in order to get script timing and other information.
 950        #
 951        self.mConnStatus = {}
 952
 953    def closeConnection(self, ip_port):
 954        if ip_port in self.mConnStatus:
 955            del self.mConnStatus[ip_port]
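
        # NOTE (assumption): getNextEvent() below calls self.createConnection(), which this class
        # does not define in this file; a minimal helper mirroring LLQueryStream.createConnection()
        # is sketched here so the log-replay path has the connection tracking it expects.
    def createConnection(self, client_ip_port, event_time):
        # Create a new connection record or return the existing one
        if not client_ip_port in self.mConnStatus:
            self.mConnStatus[client_ip_port] = LLConnStatus(client_ip_port, event_time)
        return self.mConnStatus[client_ip_port]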
 956
 957    def getNextEvent(self):
 958        # Get the next event out of the file
 959        cur_event = None
 960        event_time = None
 961        event_type = None
 962        ip = None
 963        port = None
 964        ip_port = None
 965        cur_state = 'Metadata'
 966        for line in self.mLineIter:
 967            if line == '':
 968                return (None, None)
 969            if cur_state == 'Metadata':
 970                # We're looking for an event.  Actually we better find one.
 971                m = EVENT_RE.match(line)
 972                if not m:
 973                    #raise "Missing event on line: %s" % line
 974                    continue
 975                else:
 976                    event_time = float(m.group(1))
 977                    ip = m.group(2)
 978                    port = int(m.group(3))
 979                    ip_port = m.group(2)+":"+m.group(3)
 980                    clean_host = m.group(4)
 981                    event_type = m.group(5)
 982                    query_str = ''
 983                    cur_state = 'Query'
 984            elif cur_state == 'Query':
 985                if not SECTION_RE.match(line):
 986                    query_str += line
 987                else:
 988                    # We're done
 989                    # Generate the event to return
 990                    # Track the connection if we don't know about it yet.
 991                    conn = self.createConnection(ip_port, event_time)
 992
 993                    if event_type == 'QueryStart':
 994                        conn.queryStartProcessed(ip, port, event_time, query_str)
 995                        return ("QueryStart", conn.mLastQuery)
 996                    elif event_type == 'QueryResponse':
 997                        # Update the status of this connection, including query times on
 998                        # connections
 999                        # Hack: Result type defaults to zero
1000                        if conn.updateResponse(event_time, 0):
1001                            # Event: Initial query response
1002                            return ("QueryResponse", conn.mLastQuery)
1003                        else:
1004                            # Skip responses which we don't have the start for
1005                            cur_state = 'Metadata'
1006                    elif event_type == 'Quit':
1007                        # Event: Quitting a connection
1008                        conn.quit(ip, port, event_time)
1009                        # This connection is closing, get rid of it
1010                        self.closeConnection(ip_port)
1011                        return ("Quit", conn.mLastQuery)
1012                    else:
1013                        raise Exception("Unknown event type %s" % event_type)
1014        return (None, None)
1015
1016def start_dump(host, port):
1017    # Start up tcpdump pushing data into netcat on the sql server
1018    interface = "eth0"
1019    
1020    # Start up tcpdump pushing data into netcat on the sql server
1021    SRC_DUMP_CMD = "ssh root@%s '/usr/sbin/tcpdump -p -n -s 0 -w - -i %s dst port 3306 or src port 3306 | nc %s %d'" \
1022                   % (host, interface, socket.getfqdn(), port)
1023    os.popen2(SRC_DUMP_CMD, "r")
1024
1025def remote_mysql_stream(host):
1026    # Create a server socket, then have tcpdump dump stuff to it.
1027    serversocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
1028
1029    bound = False
1030    port = 9999
1031    while not bound:
1032        try:
1033            serversocket.bind((socket.gethostname(), port))
1034            bound = True
1035        except:
1036            print port, " already bound, trying again"
1037            port += 1
1038    print "Bound port %d" % port
1039    serversocket.listen(1)
1040
1041    # Fork off the dumper, start the server on the main connection
1042    pid = os.fork()
1043    if not pid:
1044        # Child process which gets data from the database
1045        time.sleep(1.0)
1046        print "Starting dump!"
1047        start_dump(host, port)
1048        print "Exiting dump!"
1049        sys.exit(0)
1050
1051    print "Starting server"
1052    (clientsocket, address) = serversocket.accept()
1053    print "Accepted connection", address
1054
1055    # Start listening to the data stream
1056    return clientsocket.makefile("rb")
1057
1058#
1059# Utility stuff for query cleaner
1060#
1061# This is a Python port of (part of) the fingerprint() function from
1062# the mk-query-digest script in Maatkit, added by Yoz, with various additions/tweaks
1063
1064hex_wildcard = r"[0-9a-f]"
1065word = hex_wildcard + r"{4}-"
1066long_word = hex_wildcard + r"{8}-"
1067very_long_word = hex_wildcard + r"{12}"
1068UUID_REGEX_STRING = long_word + word + word + word + very_long_word
1069
1070hex_re = re.compile("^[\da-f]+$",re.I)
1071uuid_re = re.compile("^"+UUID_REGEX_STRING+"$",re.I)
1072
1073def string_replace(match):
1074    "Called by string-matching regexp in replacers"
1075    if uuid_re.match(match.group(1)):
1076        return "*uuid*"
1077    return "*string*"
1078    
1079
1080# list of (match,replacement) tuples used by clean_query()
1081replacers = [
1082    # Disabling comment removal because we may put useful inspection info in there
1083    #(re.compile(r'(?:--|#)[^\'"\r\n]*(?=[\r\n]|\Z)',re.I),""), # one-line comments
1084    #(re.compile(r"/\*[^!].*?\*/",re.I|re.M|re.S),""), # But not /*!version */
1085
1086    (re.compile(r"\\\\"),""), # remove backslash pairs that may confuse the next line    
1087    (re.compile(r"\\[\"']"),""), # remove escaped quotes
1088    
1089    (re.compile(r'"([^"]*)"',re.I),string_replace), # quoted strings
1090    (re.compile(r"'([^']*)'",re.I),string_replace), # quoted strings
1091    
1092    # this next one may need more work, due to "UPDATE ... SET money = money-23"
1093    # the next two are significantly different from the maatkit original code
1094    (re.compile(r"(?<![\w\)\d])(\s*)\-\d+(\.\d+)?",re.I),"*num*"), # negative reals
1095    (re.compile(r"(?<![\w])\d+(\.\d+)?",re.I),"*num*"), # positive reals
1096    # mk-query-digest has s/[xb.+-]\?/?/g; as "clean up leftovers" here, whatever that means - I've left it out
1097    
1098    (re.compile(r"^\s+",re.I),""), # chop off leading whitespace
1099    (re.compile(r"\s+$",re.I|re.M|re.S),""), # kill trailing whitespace
1100    
1101    # reduce IN and VALUES lists (look for previously-cleaned placeholders)
1102    (re.compile(r"\b(in|values)(?:[\s,]*\(([\s\,]*\*(num|string|uuid)\*)*[\s,]*\))+",
1103                re.I|re.X),"\\1(*values*)"), # collapse IN and VALUES lists
1104    
1105    # This next one collapses chains of UNIONed functionally-identical queries,
1106    # but it's only really useful if you're regularly seeing more than 2 queries
1107    # in a chain. We don't seem to have any like that, so I'm disabling this.
1108    #(re.compile(r"\b(select\s.*?)(?:(\sunion(?:\sall)?)\s\1)+",re.I),"\\1 -- repeat\\2 --"), # collapse UNION
1109    
1110    # remove "OFFSET *num*" when following a LIMIT
1111    (re.compile(r"\blimit \*num\*(?:, ?\*num\*| offset \*num\*)?",re.I),"LIMIT *num*")
1112]
1113
1114prepare_re = re.compile('PREPARE.*', re.IGNORECASE)
1115deallocate_re = re.compile('DEALLOCATE\s+PREPARE.*', re.IGNORECASE)
1116execute_re = re.compile('EXECUTE.*', re.IGNORECASE)
1117mdb_re = re.compile('MDB2_STATEMENT\S+')
1118
1119def clean_query(query, num_words):
1120    "Generalizes a query by removing all unique information"
1121    # Strip carriage returns
1122    query = query.replace("\n", " ")
1123
1124    # Screw it, if it's a prepared statement or an execute, generalize the statement name
1125    if prepare_re.match(query):
1126        query = mdb_re.sub('*statement*', query)
1127        return query
1128    if execute_re.match(query):
1129        query = mdb_re.sub('*statement*', query)
1130    if deallocate_re.match(query):
1131        query = "DEALLOCATE PREPARE"
1132        return query
1133
1134    # Loop through the replacers and perform each one
1135    for (replacer, subst) in replacers:
1136        # try block is here because, apparently, string_re may throw an exception
1137        # TODO: investigate the above
1138        try:
1139            query = replacer.sub(subst, query)
1140        except:
1141            pass
1142
1143    # After we do the cleanup, then we get rid of extra whitespace
1144    words = query.split(None)
1145    query = " ".join(words)    
1146    return query
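    # Example (hypothetical query): clean_query("SELECT * FROM users WHERE id = 42 AND name = 'bob'", 0)
    # returns "SELECT * FROM users WHERE id = *num* AND name = *string*".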
1147
1148def test_clean_query(query):
1149    "A debug version of the query cleaner which prints steps as it goes"
1150
1151    # Strip carriage returns
1152    query = query.replace("\n", " ")
1153
1154    # Screw it, if it's a prepared statement or an execute, generalize the statement name
1155    if prepare_re.match(query):
1156        query = mdb_re.sub('*statement*', query)
1157        return query
1158    if execute_re.match(query):
1159        query = mdb_re.sub('*statement*', query)
1160    if deallocate_re.match(query):
1161        query = "DEALLOCATE PREPARE"
1162        return query
1163
1164    # Loop through the replacers and perform each one
1165    for (replacer, subst) in replacers:
1166        try:
1167            if replacer.search(query) == None:
1168                print replacer.pattern," : No match"
1169            else:
1170                query = replacer.sub(subst, query)
1171                print replacer.pattern," : ",query
1172        except:
1173            pass
1174
1175    # After we do the cleanup, then we get rid of extra whitespace
1176    words = query.split(None)
1177    query = " ".join(words)    
1178    return query
1179
1180
1181#
1182# Hostname cache - basically, caches the "linden" host type for a particular IP address
1183# or hostname
1184#
1185sim_re = re.compile(".*sim\d+.*")
1186web_re = re.compile("int\.web\d+.*")
1187iweb_re = re.compile("int\.iweb\d+.*")
1188webds_re = re.compile(".*web-ds\d+.*")
1189webster_re = re.compile(".*webster\d+.*")
1190bankds_re = re.compile(".*bank-ds\d+.*")
1191xmlrpc_re = re.compile(".*xmlrpc\d+.*")
1192login_re = re.compile(".*login\d+.*")
1193data_re = re.compile(".*data\..*")
1194#xmlrpc_re = re.compile("(?:int\.omgiwanna.*)|(?:int\.pony.*)")
1195ip_re = re.compile("\d+\.\d+\.\d+\.\d+")
1196ll_re = re.compile("(.*)\.lindenlab\.com")
1197
1198host_type_cache = {}
1199def get_host_type(host):
1200    "Returns the genericized linden host type from an IP address or hostname"
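        # e.g. (hypothetical host) get_host_type("sim1234.agni.lindenlab.com") returns
        # ("sim", "sim1234.agni.lindenlab.com"); raw IPs are reverse-resolved first when possible.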
1201#    if host in host_type_cache:
1202#        return host_type_cache[host]
1203
1204    named_host = str(host)
1205    if ip_re.match(host):
1206        # Look up the hostname
1207        try:
1208            named_host = str(socket.gethostbyaddr(host)[0])
1209        except:
1210            pass
1211
1212    # Figure out generic host type
1213    host_type = named_host
1214    if sim_re.match(named_host):
1215        host_type = "sim"
1216    elif login_re.match(named_host):
1217        host_type = "login"
1218    elif webster_re.match(named_host):
1219        host_type = "webster"
1220    elif bankds_re.match(named_host):
1221        host_type = "bank-ds"
1222    elif web_re.match(named_host):
1223        host_type = "web"
1224    elif iweb_re.match(named_host):
1225        host_type = "iweb"
1226    elif webds_re.match(named_host):
1227        host_type = "web-ds"
1228    elif data_re.match(named_host):
1229        host_type = "data"
1230    elif xmlrpc_re.match(named_host):
1231        host_type = "xmlrpc"
1232    m = ll_re.match(host_type)
1233    if m:
1234        host_type = m.group(1)
1235    host_type_cache[host] = host_type
1236    return (host_type, named_host)
1237
1238
1239def LLLogIter(filenames):
1240    "An iterator that iterates line by line over a series of files, even if they're compressed."
1241    for f in filenames:
1242        curr = open_log_file(f)
1243        for line in curr:
1244            yield line
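    # Example usage (hypothetical filenames):
    #   for line in LLLogIter(["query_dump.log", "query_dump.log.1.gz"]):
    #       ...  # lines come back in order, gzipped files included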
1245
1246            
1247def open_log_file(filename):
1248    # Open the logfile (even if it's compressed)
1249    if re.compile(".+\.gz").match(filename):
1250        # gzipped file, return a gzipped file object
1251        return gzip.open(filename,"r")
1252    else:
1253        return open(filename, "r")