PageRenderTime 187ms CodeModel.GetById 90ms app.highlight 56ms RepoModel.GetById 15ms app.codeStats 0ms

/apiary/mysql/sqllog.py

https://bitbucket.org/lindenlab/apiary/
Python | 569 lines | 528 code | 15 blank | 26 comment | 15 complexity | a509721b9db000773fb883de1dd21d2f MD5 | raw file
  1#
  2# $LicenseInfo:firstyear=2010&license=mit$
  3# 
  4# Copyright (c) 2010, Linden Research, Inc.
  5# 
  6# Permission is hereby granted, free of charge, to any person obtaining a copy
  7# of this software and associated documentation files (the "Software"), to deal
  8# in the Software without restriction, including without limitation the rights
  9# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 10# copies of the Software, and to permit persons to whom the Software is
 11# furnished to do so, subject to the following conditions:
 12# 
 13# The above copyright notice and this permission notice shall be included in
 14# all copies or substantial portions of the Software.
 15# 
 16# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 19# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 21# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 22# THE SOFTWARE.
 23# $/LicenseInfo$
 24#
 25
 26import heapq
 27import re
 28import sys
 29import time
 30import cPickle
 31
 32from apiary.tools.debug import *
 33from apiary.tools import mergetools
 34from apiary.tools import timestamp
 35
 36__all__ = [
 37        'Event',
 38        'CoalescedEvent',
 39        'Sequence',
 40        'parse_stanza',
 41        'EventReader',
 42        'input_events',
 43        'FollowSequences',
 44        'CoalesceSequences'
 45    ]
 46
 47headerRE = re.compile(r'^(?P<time>\d+\.\d+)\t(?P<id>[\d.:]+)\t?(?P<source>\S*)\t(?P<state>\w+)$')
 48breakRE = re.compile(r'^\*{3,}$')
 49commentRE = re.compile(r'^\-{2,}.*$')
 50timeRE = re.compile(r'^# Time: (\d+ [\d\w:.]+)$')
 51clientRE = re.compile(r'^# Client: ([\d.:]+)$')
 52threadRE = re.compile(r'# Thread_id: (\d+)$')
 53admin_commandRE = re.compile(r'^# administrator command: (\w+);$')
 54query_log_commentRE = re.compile(r'^#')
 55
 56class EventFile(object):
 57    def __init__(self, name):
 58        if name == "-":
 59            self._file = sys.stdin
 60        else:
 61            self._file = open(name, 'r')
 62            
 63        self._lines = []
 64    
 65    def readline(self):
 66        if self._lines:
 67            return self._lines.pop()
 68        else:
 69            return self._file.readline()
 70    
 71    def unreadline(self, line):
 72        self._lines.append(line)
 73
 74class Event(object):
 75    # state values
 76    Query = 'QueryStart'
 77    Response = 'QueryResponse'
 78    End = 'Quit'
 79    
 80    #@traced_method
 81    def __init__(self, time, id, source, state, body):
 82        self.time = time
 83        if type(time) is not timestamp.TimeStamp:
 84            self.time = timestamp.TimeStamp(time)
 85        self.id = id
 86        self.source = source
 87        self.state = state
 88        self.body = body
 89    
 90    def __cmp__(self, other):
 91        c = cmp(self.time, other.time)
 92        if c == 0:
 93            if self.state == "Quit" and other.state != "Quit":
 94                return 1
 95            if self.state != "Quit" and other.state == "Quit":
 96                return -1
 97        return c
 98        
 99    def __str__(self):
100        return ("%s\t%s\t%s\t%s\n%s\n**************************************\n"
101            % (self.time, self.id, self.source, self.state, self.body))
102    
103    def events(self):
104        es = []
105        for event in self.body.split("\n+++\n"):
106            parts = event.split(":", 1)
107            if len(parts) == 2:
108                (time, body) = parts
109                status = Event.Query
110                if body == "Quit":
111                    status = Event.End
112                es.append(Event(time, self.id, self.source, status, body))
113        return es
114
115class CoalescedEvent(Event):
116    Sequence = 'Sequence'
117    
118    def __init__(self, shelf_life=None, max_life=None):
119        self.time = None
120        self.id = None
121        self.source = None
122        self.state = CoalescedEvent.Sequence
123        self.body = ""
124        self.ended = False
125        self.lasttime = None
126        self.staletime = None
127        self.maxtime = None
128        self.shelf_life = None
129        if shelf_life:
130            self.shelf_life = timestamp.TimeStamp(shelf_life)
131        self.max_life = None
132        if max_life:
133            self.max_life = timestamp.TimeStamp(max_life)
134    
135    def add(self, event):
136        if self.time is None:
137            self.time = event.time
138            if self.max_life:
139                self.maxtime = self.time + self.max_life
140            self.id = event.id
141            self.source = event.source
142        self.lasttime = event.time
143        if self.shelf_life:
144            self.staletime = self.lasttime + self.shelf_life
145        if event.state == Event.End:
146            self.body += "%s:Quit\n+++\n" % (event.time)
147            self.ended = True
148        elif event.state == Event.Query:
149            self.body += "%s:%s\n+++\n" % (event.time, event.body) 
150        # Ignore Event.QueryResponse, because we only care about queries sent.
151    
152    def endIfNeeded(self):
153        if not self.ended:
154            self.add(Event(self.lasttime, self.id, self.source,
155                            Event.End, 'Quit'))
156
157
158#@traced_func
159def parse_stanza(input):
160    
161    # This function is a bit gnarly because it can handle either the output of
162    # mk-query-digest or the output of mysql_watcher/mysql_logger.
163    #
164    # mk-query-digest parsing code borrowed from Dante Linden.
165    
166    seconds = 0
167    id = ''
168    source = ''
169    state = Event.Query
170    admin_command = ''
171    line = ''
172    body = ''
173    match = None
174    while not match and not query_log_commentRE.match(line):
175        line = input.readline()
176        if line == '':  # empty string means EOF
177            return None
178        if commentRE.match(line): # catch comments before the headers
179            line = input.readline() # Skip the line
180        # parse the header in case it's a seq log
181        match = headerRE.match(line)
182    
183    if match:
184        seconds = match.group('time')
185        id = match.group('id')
186        source = match.group('source') or ''
187        state = match.group('state')
188        
189        line = input.readline()
190
191    # if processing a mk-query-digest query log, extract info from comments
192    while query_log_commentRE.match(line):
193        if timeRE.match(line):
194            # if seconds, then you've hit another query in a digest log because
195            # the previous query had no body (i.e., the comments from one query
196            # border the comments for the next query)
197            if seconds:
198                # seek backward so we can process this timestamp as part of the
199                # next stanza
200                input.unreadline(line)
201                return Event(float(seconds), id, source, state, body)
202
203            timestamp = timeRE.match(line).groups()[0]
204            # convert timestamp into seconds since epoch:
205            # strip off subseconds, convert remainder to seconds, 
206            # append subseconds
207            parts = timestamp.split('.')
208            if len(parts) == 2:
209                date_time, subseconds = timestamp.split('.')
210            else:
211                date_time = timestamp
212                subseconds = 0
213            seconds = str(int(time.mktime(time.strptime(date_time, "%y%m%d %H:%M:%S")))) + ".%s" % subseconds
214        if clientRE.match(line):
215            id = clientRE.match(line).groups()[0]
216        if threadRE.match(line):
217            id += ':%s' % threadRE.match(line).groups(0)
218        if admin_commandRE.match(line):
219            admin_command = admin_commandRE.match(line).groups()[0]
220            if admin_command == "Quit":
221                state = Event.End
222    
223        line = input.readline()
224        if line == '':
225            return None
226
227        
228    # we should be to the body of the stanza now 
229    while True:
230        while commentRE.match(line):
231            line = input.readline() # Skip the line
232        if line == '': # EOF
233            break
234        if breakRE.match(line):
235            break
236        if query_log_commentRE.match(line):
237            break
238        body += line
239        line = input.readline()
240
241    # any admin commands follow the body
242    if admin_commandRE.match(line):
243        admin_command = admin_commandRE.match(line).groups()[0]
244        if admin_command == "Quit":
245            state = Event.End
246    else:
247        # the last line we read was a comment.  seek backwards
248        # so that we see it the next time we read from input
249        input.unreadline(line)
250    #print "seconds=%s, id=%s, body=%s" % (seconds, id, body)
251    return Event(float(seconds), id, source, state, body)
252    
253class Sequence(object):
254    def __init__(self):
255        self._count = 0
256        self._time_start = None
257        self._last_event = None
258    
259    def note(self, event):
260        self._count += 1
261        if self._time_start is None:
262            self._time_start = event.time
263        self._last_event = event
264    
265    def count(self):
266        return self._count
267    
268    def time(self):
269        return self._last_event.time - self._time_start
270    
271    def timeto(self, event):
272        if self._last_event is None:
273            return None
274        return event.time - self._last_event.time
275        
276    def ended(self):
277        return self._last_event.state == Event.End
278        
279    def generateEnd(self, t=None):
280        e = self._last_event
281        if t is None:
282            t = e.time
283        return Event(t, e.id, e.source, Event.End, "")
284
285
286class RawEventReader(object):
287    def __init__(self, input):
288        self._input = input
289    
290    def __iter__(self):
291        while True:
292            s = parse_stanza(self._input)
293            if s is None:
294                return
295            yield s
296
297class EventReader(object):
298    def __init__(self, input):
299        self._input = input
300    
301    def __iter__(self):
302        while True:
303            s = parse_stanza(self._input)
304            if s is None:
305                return
306            if s.state == CoalescedEvent.Sequence:
307                for t in s.events():
308                    yield t
309            else:
310                yield s
311                
312#@traced_func
313def input_events(specs):
314    if len(specs) == 0:
315        specs = ['-']
316    evs = [EventReader(EventFile(spec)) for spec in specs]
317    return mergetools.imerge(*evs)
318
319def split_events(specs, dest_prefix, num_splits):
320    if len(specs) == 0:
321        specs = ['-']
322
323    dests = [open(dest_prefix + str(n), 'w') for n in xrange(num_splits)]
324    
325    evs = [EventReader(EventFile(spec)) for spec in specs]
326    merged = mergetools.imerge(*evs)
327    
328    start_time = time.time()
329    
330    for num, event in enumerate(merged):
331        if num % 10000 == 0:
332            elapsed = time.time() - start_time
333            print "split %d events in %s seconds (%.2f events/sec)..." % (num, elapsed, float(num) / float(elapsed))
334        
335        dests[num % num_splits].write(str(event))
336
337def pickle_events(specs, dest):
338    if len(specs) == 0:
339        specs = ['-']
340    
341    if not isinstance(dest, file):
342        dest = open(dest, 'w')
343    
344    evs = [EventReader(EventFile(spec)) for spec in specs]
345    merged = mergetools.imerge(*evs)
346    
347    start_time = time.time()
348    
349    for num, event in enumerate(merged):
350        if num % 10000 == 0:
351            elapsed = time.time() - start_time
352            print "pickled %d events in %s seconds (%.2f events/sec)..." % (num, elapsed, float(num) / float(elapsed))
353        
354        if event.state == CoalescedEvent.Sequence:
355            for subevent in event.events():
356                cPickle.dump(subevent, file=dest)
357        else:
358            cPickle.dump(event, file=dest)
359
360
361def input_pickled_events(specs):
362    for spec in specs:
363        sequence_file = open(spec)
364        
365        while True:
366            try:
367                event = cPickle.load(sequence_file)
368                if event.state == CoalescedEvent.Sequence:
369                    for subevent in event.events():
370                        yield subevent
371                else:
372                    yield event
373            except EOFError:
374                break
375        
376class FollowSequences(object):
377    def replay(self, events):
378        connections = { }
379        lastt = None;
380        for e in events:
381            id = e.id
382            lastt = e.time
383            if id not in connections:
384                s = connections[id] = Sequence()
385                self.addingSequence(s, e)
386            else:
387                s = connections[id]
388            self.notingEvent(s, e)
389            s.note(e)
390            if s.ended():
391                self.removingSequence(s, e)
392                del connections[id]
393            if False:
394                expired = []
395                for (id,s) in connections.iteritems():
396                    w = s.timeto(e)
397                    if w and float(w) > 60.0:
398                        expired.append((id,s))
399                for (id,s) in expired:
400                    f = s.generateEnd(e.time)
401                    self.forcedEnd(s, f)
402                    self.removingSequence(s, f)
403                    del connections[id]
404        for s in connections.itervalues():
405            f = s.generateEnd(lastt)
406            self.forcedEnd(s, f)
407            self.removingSequence(s, f)
408            
409    def addingSequence(self, s, e):
410        pass
411    
412    def notingEvent(self, s, e):
413        pass
414    
415    def forcedEnd(self, s, e):
416        pass
417    
418    def removingSequence(self, s, e):
419        pass
420
421
422class CoalesceSequences(object):
423    def __init__(self):
424        self.connections = { }
425        self.bytime = [ ]
426        self.starttime = None
427        self.lasttime = None
428        
429    def heartbeat(self, n):
430        if self.bytime:
431            sys.stderr.write("front of the queue = %s\n" % self.bytime[0].id)
432        sys.stderr.write("%s: %d events... (%d connections, %d waiting)\n"
433            % (str(self.lasttime - self.starttime), n, len(self.connections), len(self.bytime)))
434            
435        # The stuff below summarizes the queue in this format:
436        # : <item> : <item> : <item>
437        # Where <item> is one of:
438        #   connection id(time since last message seen)
439        #   -- n -- where n is a number of ended connections just waiting to be printed
440        #
441        # Up to the first 5 connections found will be printed, along with the gaps of ended connections
442        # between them.
443        
444        n = 0
445        i = 0
446        l = len(self.bytime)
447        s = ""
448        while n < 5 and i < l:
449            en = 0
450            while i < l and self.bytime[i].ended:
451                en += 1
452                i += 1
453            if en > 0:
454                s += " : --%d--" % en
455            else:
456                n += 1
457                s += " : %s(%s)" % (self.bytime[i].id, str(self.lasttime - self.bytime[i].lasttime))
458                i += 1
459        sys.stderr.write("                          ")
460        sys.stderr.write(s)
461        sys.stderr.write("\n")
462
463    def age_out(self, c):
464        if c.staletime and self.lasttime >= c.staletime:
465            sys.stderr.write("                           expiring %s, stale\n" % c.id)
466        elif c.maxtime and self.lasttime >= c.maxtime:
467            sys.stderr.write("                           expiring %s, maxed out\n" % c.id)
468        else:
469            return False
470        c.endIfNeeded()
471        del self.connections[c.id]
472        return True
473        
474    def flush_completed(self):
475        """Print query sequences that have completed.
476        
477        Query sequences are always printed in order of the time the
478        sequence started.  Sequence endings are sometimes not present
479        in the event stream, in which case we must wait until the
480        sequence times out before printing it out.  No sequences after
481        the "stuck" sequence will be printed until it times out.
482        """
483        
484        bytime = self.bytime
485        
486        while bytime:
487            c = bytime[0]
488            if not c.ended:
489                if not self.age_out(c):
490                    return
491            heapq.heappop(bytime)
492            self.fullSequence(c)
493            
494    def flush_all(self):
495        """Flush all sequences.
496        
497        Sequences are flushed even if no end event has been seen and 
498        they have not timed out yet.
499        """
500        
501        while self.bytime:
502            c = heapq.heappop(self.bytime)
503            c.endIfNeeded()
504            self.fullSequence(c)
505
506    def replay(self, events):
507        """Correlate an interleaved query stream into sequences of queries."""
508        
509        # n = number of events seen
510        n = 0
511        
512        # s = number of sequences seen
513        s = 0
514        
515        # self.connections tracks open connections for which we have not seen an end event.
516        connections = self.connections
517        
518        # bytime contains all queries that have not yet been printed as 
519        # sequences.  It is a min-heap ordered by time, so that the 
520        # earliest event is always first in the list.
521        bytime = self.bytime
522        
523        for e in events:
524            id = e.id
525            self.lasttime = e.time
526            if self.starttime is None:
527                self.starttime = self.lasttime
528
529            n += 1
530            if n % 10000 == 0:
531                # Print stats every 10000 events.
532                self.heartbeat(n)
533            
534            # If this connection is already in the lsit of open connections, 
535            # see if it's stale or too old.  Sometimes the query stream doesn't
536            # contain the End event for a given connection, so we need to time
537            # it out.
538            if id in connections:
539                c = connections[id]
540                self.age_out(c)
541
542            # At this point, the connection may have been aged out and removed from
543            # self.connections.  Otherwise, this could be the first message seen
544            # for this connection.  In either case, make a new connection.
545            if id not in connections:
546                s += 1
547                c = connections[id] = CoalescedEvent(300.0, 900.0)
548                c.add(e)
549                heapq.heappush(bytime, c)
550            else:
551                c.add(e)
552
553            # Check if the connection is closing.  If so, remove it from 
554            # self.connections, but don't print it out yet.  Events must
555            # be printed in order.
556            if e.state == Event.End:
557                del connections[id]
558
559            self.flush_completed()
560
561        for d in connections.itervalues():
562            d.endIfNeeded()
563        self.flush_all()
564        
565        print >> sys.stderr, "%d events processed; %d sequences produced" % (n, s)
566                    
567    def fullSequence(self, e):
568        raise NotImplemented("fullSequence() should be implemented by a child class")
569