PageRenderTime 42ms CodeModel.GetById 20ms RepoModel.GetById 1ms app.codeStats 0ms

/apiary/mysql/sqllog.py

https://bitbucket.org/lindenlab/apiary/
Python | 569 lines | 528 code | 15 blank | 26 comment | 13 complexity | a509721b9db000773fb883de1dd21d2f MD5 | raw file
  1. #
  2. # $LicenseInfo:firstyear=2010&license=mit$
  3. #
  4. # Copyright (c) 2010, Linden Research, Inc.
  5. #
  6. # Permission is hereby granted, free of charge, to any person obtaining a copy
  7. # of this software and associated documentation files (the "Software"), to deal
  8. # in the Software without restriction, including without limitation the rights
  9. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10. # copies of the Software, and to permit persons to whom the Software is
  11. # furnished to do so, subject to the following conditions:
  12. #
  13. # The above copyright notice and this permission notice shall be included in
  14. # all copies or substantial portions of the Software.
  15. #
  16. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22. # THE SOFTWARE.
  23. # $/LicenseInfo$
  24. #
  25. import heapq
  26. import re
  27. import sys
  28. import time
  29. import cPickle
  30. from apiary.tools.debug import *
  31. from apiary.tools import mergetools
  32. from apiary.tools import timestamp
  33. __all__ = [
  34. 'Event',
  35. 'CoalescedEvent',
  36. 'Sequence',
  37. 'parse_stanza',
  38. 'EventReader',
  39. 'input_events',
  40. 'FollowSequences',
  41. 'CoalesceSequences'
  42. ]
  43. headerRE = re.compile(r'^(?P<time>\d+\.\d+)\t(?P<id>[\d.:]+)\t?(?P<source>\S*)\t(?P<state>\w+)$')
  44. breakRE = re.compile(r'^\*{3,}$')
  45. commentRE = re.compile(r'^\-{2,}.*$')
  46. timeRE = re.compile(r'^# Time: (\d+ [\d\w:.]+)$')
  47. clientRE = re.compile(r'^# Client: ([\d.:]+)$')
  48. threadRE = re.compile(r'# Thread_id: (\d+)$')
  49. admin_commandRE = re.compile(r'^# administrator command: (\w+);$')
  50. query_log_commentRE = re.compile(r'^#')
  51. class EventFile(object):
  52. def __init__(self, name):
  53. if name == "-":
  54. self._file = sys.stdin
  55. else:
  56. self._file = open(name, 'r')
  57. self._lines = []
  58. def readline(self):
  59. if self._lines:
  60. return self._lines.pop()
  61. else:
  62. return self._file.readline()
  63. def unreadline(self, line):
  64. self._lines.append(line)
  65. class Event(object):
  66. # state values
  67. Query = 'QueryStart'
  68. Response = 'QueryResponse'
  69. End = 'Quit'
  70. #@traced_method
  71. def __init__(self, time, id, source, state, body):
  72. self.time = time
  73. if type(time) is not timestamp.TimeStamp:
  74. self.time = timestamp.TimeStamp(time)
  75. self.id = id
  76. self.source = source
  77. self.state = state
  78. self.body = body
  79. def __cmp__(self, other):
  80. c = cmp(self.time, other.time)
  81. if c == 0:
  82. if self.state == "Quit" and other.state != "Quit":
  83. return 1
  84. if self.state != "Quit" and other.state == "Quit":
  85. return -1
  86. return c
  87. def __str__(self):
  88. return ("%s\t%s\t%s\t%s\n%s\n**************************************\n"
  89. % (self.time, self.id, self.source, self.state, self.body))
  90. def events(self):
  91. es = []
  92. for event in self.body.split("\n+++\n"):
  93. parts = event.split(":", 1)
  94. if len(parts) == 2:
  95. (time, body) = parts
  96. status = Event.Query
  97. if body == "Quit":
  98. status = Event.End
  99. es.append(Event(time, self.id, self.source, status, body))
  100. return es
  101. class CoalescedEvent(Event):
  102. Sequence = 'Sequence'
  103. def __init__(self, shelf_life=None, max_life=None):
  104. self.time = None
  105. self.id = None
  106. self.source = None
  107. self.state = CoalescedEvent.Sequence
  108. self.body = ""
  109. self.ended = False
  110. self.lasttime = None
  111. self.staletime = None
  112. self.maxtime = None
  113. self.shelf_life = None
  114. if shelf_life:
  115. self.shelf_life = timestamp.TimeStamp(shelf_life)
  116. self.max_life = None
  117. if max_life:
  118. self.max_life = timestamp.TimeStamp(max_life)
  119. def add(self, event):
  120. if self.time is None:
  121. self.time = event.time
  122. if self.max_life:
  123. self.maxtime = self.time + self.max_life
  124. self.id = event.id
  125. self.source = event.source
  126. self.lasttime = event.time
  127. if self.shelf_life:
  128. self.staletime = self.lasttime + self.shelf_life
  129. if event.state == Event.End:
  130. self.body += "%s:Quit\n+++\n" % (event.time)
  131. self.ended = True
  132. elif event.state == Event.Query:
  133. self.body += "%s:%s\n+++\n" % (event.time, event.body)
  134. # Ignore Event.QueryResponse, because we only care about queries sent.
  135. def endIfNeeded(self):
  136. if not self.ended:
  137. self.add(Event(self.lasttime, self.id, self.source,
  138. Event.End, 'Quit'))
  139. #@traced_func
  140. def parse_stanza(input):
  141. # This function is a bit gnarly because it can handle either the output of
  142. # mk-query-digest or the output of mysql_watcher/mysql_logger.
  143. #
  144. # mk-query-digest parsing code borrowed from Dante Linden.
  145. seconds = 0
  146. id = ''
  147. source = ''
  148. state = Event.Query
  149. admin_command = ''
  150. line = ''
  151. body = ''
  152. match = None
  153. while not match and not query_log_commentRE.match(line):
  154. line = input.readline()
  155. if line == '': # empty string means EOF
  156. return None
  157. if commentRE.match(line): # catch comments before the headers
  158. line = input.readline() # Skip the line
  159. # parse the header in case it's a seq log
  160. match = headerRE.match(line)
  161. if match:
  162. seconds = match.group('time')
  163. id = match.group('id')
  164. source = match.group('source') or ''
  165. state = match.group('state')
  166. line = input.readline()
  167. # if processing a mk-query-digest query log, extract info from comments
  168. while query_log_commentRE.match(line):
  169. if timeRE.match(line):
  170. # if seconds, then you've hit another query in a digest log because
  171. # the previous query had no body (i.e., the comments from one query
  172. # border the comments for the next query)
  173. if seconds:
  174. # seek backward so we can process this timestamp as part of the
  175. # next stanza
  176. input.unreadline(line)
  177. return Event(float(seconds), id, source, state, body)
  178. timestamp = timeRE.match(line).groups()[0]
  179. # convert timestamp into seconds since epoch:
  180. # strip off subseconds, convert remainder to seconds,
  181. # append subseconds
  182. parts = timestamp.split('.')
  183. if len(parts) == 2:
  184. date_time, subseconds = timestamp.split('.')
  185. else:
  186. date_time = timestamp
  187. subseconds = 0
  188. seconds = str(int(time.mktime(time.strptime(date_time, "%y%m%d %H:%M:%S")))) + ".%s" % subseconds
  189. if clientRE.match(line):
  190. id = clientRE.match(line).groups()[0]
  191. if threadRE.match(line):
  192. id += ':%s' % threadRE.match(line).groups(0)
  193. if admin_commandRE.match(line):
  194. admin_command = admin_commandRE.match(line).groups()[0]
  195. if admin_command == "Quit":
  196. state = Event.End
  197. line = input.readline()
  198. if line == '':
  199. return None
  200. # we should be to the body of the stanza now
  201. while True:
  202. while commentRE.match(line):
  203. line = input.readline() # Skip the line
  204. if line == '': # EOF
  205. break
  206. if breakRE.match(line):
  207. break
  208. if query_log_commentRE.match(line):
  209. break
  210. body += line
  211. line = input.readline()
  212. # any admin commands follow the body
  213. if admin_commandRE.match(line):
  214. admin_command = admin_commandRE.match(line).groups()[0]
  215. if admin_command == "Quit":
  216. state = Event.End
  217. else:
  218. # the last line we read was a comment. seek backwards
  219. # so that we see it the next time we read from input
  220. input.unreadline(line)
  221. #print "seconds=%s, id=%s, body=%s" % (seconds, id, body)
  222. return Event(float(seconds), id, source, state, body)
  223. class Sequence(object):
  224. def __init__(self):
  225. self._count = 0
  226. self._time_start = None
  227. self._last_event = None
  228. def note(self, event):
  229. self._count += 1
  230. if self._time_start is None:
  231. self._time_start = event.time
  232. self._last_event = event
  233. def count(self):
  234. return self._count
  235. def time(self):
  236. return self._last_event.time - self._time_start
  237. def timeto(self, event):
  238. if self._last_event is None:
  239. return None
  240. return event.time - self._last_event.time
  241. def ended(self):
  242. return self._last_event.state == Event.End
  243. def generateEnd(self, t=None):
  244. e = self._last_event
  245. if t is None:
  246. t = e.time
  247. return Event(t, e.id, e.source, Event.End, "")
  248. class RawEventReader(object):
  249. def __init__(self, input):
  250. self._input = input
  251. def __iter__(self):
  252. while True:
  253. s = parse_stanza(self._input)
  254. if s is None:
  255. return
  256. yield s
  257. class EventReader(object):
  258. def __init__(self, input):
  259. self._input = input
  260. def __iter__(self):
  261. while True:
  262. s = parse_stanza(self._input)
  263. if s is None:
  264. return
  265. if s.state == CoalescedEvent.Sequence:
  266. for t in s.events():
  267. yield t
  268. else:
  269. yield s
  270. #@traced_func
  271. def input_events(specs):
  272. if len(specs) == 0:
  273. specs = ['-']
  274. evs = [EventReader(EventFile(spec)) for spec in specs]
  275. return mergetools.imerge(*evs)
  276. def split_events(specs, dest_prefix, num_splits):
  277. if len(specs) == 0:
  278. specs = ['-']
  279. dests = [open(dest_prefix + str(n), 'w') for n in xrange(num_splits)]
  280. evs = [EventReader(EventFile(spec)) for spec in specs]
  281. merged = mergetools.imerge(*evs)
  282. start_time = time.time()
  283. for num, event in enumerate(merged):
  284. if num % 10000 == 0:
  285. elapsed = time.time() - start_time
  286. print "split %d events in %s seconds (%.2f events/sec)..." % (num, elapsed, float(num) / float(elapsed))
  287. dests[num % num_splits].write(str(event))
  288. def pickle_events(specs, dest):
  289. if len(specs) == 0:
  290. specs = ['-']
  291. if not isinstance(dest, file):
  292. dest = open(dest, 'w')
  293. evs = [EventReader(EventFile(spec)) for spec in specs]
  294. merged = mergetools.imerge(*evs)
  295. start_time = time.time()
  296. for num, event in enumerate(merged):
  297. if num % 10000 == 0:
  298. elapsed = time.time() - start_time
  299. print "pickled %d events in %s seconds (%.2f events/sec)..." % (num, elapsed, float(num) / float(elapsed))
  300. if event.state == CoalescedEvent.Sequence:
  301. for subevent in event.events():
  302. cPickle.dump(subevent, file=dest)
  303. else:
  304. cPickle.dump(event, file=dest)
  305. def input_pickled_events(specs):
  306. for spec in specs:
  307. sequence_file = open(spec)
  308. while True:
  309. try:
  310. event = cPickle.load(sequence_file)
  311. if event.state == CoalescedEvent.Sequence:
  312. for subevent in event.events():
  313. yield subevent
  314. else:
  315. yield event
  316. except EOFError:
  317. break
  318. class FollowSequences(object):
  319. def replay(self, events):
  320. connections = { }
  321. lastt = None;
  322. for e in events:
  323. id = e.id
  324. lastt = e.time
  325. if id not in connections:
  326. s = connections[id] = Sequence()
  327. self.addingSequence(s, e)
  328. else:
  329. s = connections[id]
  330. self.notingEvent(s, e)
  331. s.note(e)
  332. if s.ended():
  333. self.removingSequence(s, e)
  334. del connections[id]
  335. if False:
  336. expired = []
  337. for (id,s) in connections.iteritems():
  338. w = s.timeto(e)
  339. if w and float(w) > 60.0:
  340. expired.append((id,s))
  341. for (id,s) in expired:
  342. f = s.generateEnd(e.time)
  343. self.forcedEnd(s, f)
  344. self.removingSequence(s, f)
  345. del connections[id]
  346. for s in connections.itervalues():
  347. f = s.generateEnd(lastt)
  348. self.forcedEnd(s, f)
  349. self.removingSequence(s, f)
  350. def addingSequence(self, s, e):
  351. pass
  352. def notingEvent(self, s, e):
  353. pass
  354. def forcedEnd(self, s, e):
  355. pass
  356. def removingSequence(self, s, e):
  357. pass
  358. class CoalesceSequences(object):
  359. def __init__(self):
  360. self.connections = { }
  361. self.bytime = [ ]
  362. self.starttime = None
  363. self.lasttime = None
  364. def heartbeat(self, n):
  365. if self.bytime:
  366. sys.stderr.write("front of the queue = %s\n" % self.bytime[0].id)
  367. sys.stderr.write("%s: %d events... (%d connections, %d waiting)\n"
  368. % (str(self.lasttime - self.starttime), n, len(self.connections), len(self.bytime)))
  369. # The stuff below summarizes the queue in this format:
  370. # : <item> : <item> : <item>
  371. # Where <item> is one of:
  372. # connection id(time since last message seen)
  373. # -- n -- where n is a number of ended connections just waiting to be printed
  374. #
  375. # Up to the first 5 connections found will be printed, along with the gaps of ended connections
  376. # between them.
  377. n = 0
  378. i = 0
  379. l = len(self.bytime)
  380. s = ""
  381. while n < 5 and i < l:
  382. en = 0
  383. while i < l and self.bytime[i].ended:
  384. en += 1
  385. i += 1
  386. if en > 0:
  387. s += " : --%d--" % en
  388. else:
  389. n += 1
  390. s += " : %s(%s)" % (self.bytime[i].id, str(self.lasttime - self.bytime[i].lasttime))
  391. i += 1
  392. sys.stderr.write(" ")
  393. sys.stderr.write(s)
  394. sys.stderr.write("\n")
  395. def age_out(self, c):
  396. if c.staletime and self.lasttime >= c.staletime:
  397. sys.stderr.write(" expiring %s, stale\n" % c.id)
  398. elif c.maxtime and self.lasttime >= c.maxtime:
  399. sys.stderr.write(" expiring %s, maxed out\n" % c.id)
  400. else:
  401. return False
  402. c.endIfNeeded()
  403. del self.connections[c.id]
  404. return True
  405. def flush_completed(self):
  406. """Print query sequences that have completed.
  407. Query sequences are always printed in order of the time the
  408. sequence started. Sequence endings are sometimes not present
  409. in the event stream, in which case we must wait until the
  410. sequence times out before printing it out. No sequences after
  411. the "stuck" sequence will be printed until it times out.
  412. """
  413. bytime = self.bytime
  414. while bytime:
  415. c = bytime[0]
  416. if not c.ended:
  417. if not self.age_out(c):
  418. return
  419. heapq.heappop(bytime)
  420. self.fullSequence(c)
  421. def flush_all(self):
  422. """Flush all sequences.
  423. Sequences are flushed even if no end event has been seen and
  424. they have not timed out yet.
  425. """
  426. while self.bytime:
  427. c = heapq.heappop(self.bytime)
  428. c.endIfNeeded()
  429. self.fullSequence(c)
  430. def replay(self, events):
  431. """Correlate an interleaved query stream into sequences of queries."""
  432. # n = number of events seen
  433. n = 0
  434. # s = number of sequences seen
  435. s = 0
  436. # self.connections tracks open connections for which we have not seen an end event.
  437. connections = self.connections
  438. # bytime contains all queries that have not yet been printed as
  439. # sequences. It is a min-heap ordered by time, so that the
  440. # earliest event is always first in the list.
  441. bytime = self.bytime
  442. for e in events:
  443. id = e.id
  444. self.lasttime = e.time
  445. if self.starttime is None:
  446. self.starttime = self.lasttime
  447. n += 1
  448. if n % 10000 == 0:
  449. # Print stats every 10000 events.
  450. self.heartbeat(n)
  451. # If this connection is already in the lsit of open connections,
  452. # see if it's stale or too old. Sometimes the query stream doesn't
  453. # contain the End event for a given connection, so we need to time
  454. # it out.
  455. if id in connections:
  456. c = connections[id]
  457. self.age_out(c)
  458. # At this point, the connection may have been aged out and removed from
  459. # self.connections. Otherwise, this could be the first message seen
  460. # for this connection. In either case, make a new connection.
  461. if id not in connections:
  462. s += 1
  463. c = connections[id] = CoalescedEvent(300.0, 900.0)
  464. c.add(e)
  465. heapq.heappush(bytime, c)
  466. else:
  467. c.add(e)
  468. # Check if the connection is closing. If so, remove it from
  469. # self.connections, but don't print it out yet. Events must
  470. # be printed in order.
  471. if e.state == Event.End:
  472. del connections[id]
  473. self.flush_completed()
  474. for d in connections.itervalues():
  475. d.endIfNeeded()
  476. self.flush_all()
  477. print >> sys.stderr, "%d events processed; %d sequences produced" % (n, s)
  478. def fullSequence(self, e):
  479. raise NotImplemented("fullSequence() should be implemented by a child class")