
/mysql_watcher/dblibs/dbutil.py

https://bitbucket.org/lindenlab/apiary/
  1. #!/usr/bin/env python
  2. #
  3. # $LicenseInfo:firstyear=2007&license=mit$
  4. #
  5. # Copyright (c) 2007-2010, Linden Research, Inc.
  6. #
  7. # Permission is hereby granted, free of charge, to any person obtaining a copy
  8. # of this software and associated documentation files (the "Software"), to deal
  9. # in the Software without restriction, including without limitation the rights
  10. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11. # copies of the Software, and to permit persons to whom the Software is
  12. # furnished to do so, subject to the following conditions:
  13. #
  14. # The above copyright notice and this permission notice shall be included in
  15. # all copies or substantial portions of the Software.
  16. #
  17. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  20. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  23. # THE SOFTWARE.
  24. # $/LicenseInfo$
  25. #
  26. #
  27. # Utility classes that allow us to monitor and keep track of databases
  28. #
  29. import array
  30. import binascii
  31. import gzip
  32. import math
  33. import os
  34. import re
  35. import socket
  36. import string
  37. import struct
  38. import sys
  39. import time
  40. from llbase import llsd
  41. def asciify(str):
  42. "Lame ASCIIfication of a string to keep various things from barfing"
  43. out_str = ""
  44. for ch in str:
  45. if (ch >= chr(0x9)) and (ch <= '~'):
  46. out_str += ch
  47. else:
  48. out_str += "."
  49. return out_str
  50. def all_as_maps(cursor):
  51. """Return all of the cursor with maps for each row instead of sequences"""
  52. all_seq = cursor.fetchall()
  53. ret_all = []
  54. descs = cursor.description
  55. for row in all_seq:
  56. new_row = {}
  57. count = 0
  58. for desc in descs:
  59. new_row[desc[0]] = row[count]
  60. count += 1
  61. ret_all.append(new_row)
  62. return ret_all
  63. #
  64. # Cache IP to string lookup to make it faster
  65. #
  66. ip_table = {}
  67. def lookup_ip_string(ip_bin):
  68. if not ip_bin in ip_table:
  69. ip_table[ip_bin] = "%d.%d.%d.%d" % ((ip_bin & 0xff000000L) >> 24,
  70. (ip_bin & 0x00ff0000L) >> 16,
  71. (ip_bin & 0x0000ff00L) >> 8,
  72. ip_bin & 0x000000ffL)
  73. return ip_table[ip_bin]
  74. def llquery_from_llsd(query_llsd):
  75. # Hack, fill in arbitrary data for info that isn't serialized
  76. query = LLQuery(None, None, query_llsd['query'], 0.0)
  77. query.mData['host_clean'] = query_llsd['host_clean']
  78. query.mData['query_clean'] = query_llsd['query_clean']
  79. # Hack, keeps correctOutliers from trashing the data
  80. #query.mNumQueries = query_llsd['num_queries']
  81. #query.mTotalTime = query_llsd['total_time']
  82. try:
  83. query.mNumQueriesCorrected = query_llsd['num_queries_corrected']
  84. query.mTotalTimeCorrected = query_llsd['total_time_corrected']
  85. except:
  86. # Hack for old output which didn't generate this data
  87. query.mNumQueriesCorrected = query_llsd['num_queries']
  88. query.mTotalTimeCorrected = query_llsd['total_time']
  89. return query
  90. def get_query_tables(query):
  91. "Return the list of tables in a query"
  92. #
  93. # Really dumb method, literally iterates through a bunch of regular expressions to pull this out.
  94. # There are probably better methods out there.
  95. #
  96. out_tables = []
  97. # Clean up the query
  98. query = query.replace('\n',' ')
  99. query = re.sub('\s+', ' ', query)
  100. m = LLQuery.sSelectWhereRE.match(query)
  101. if m:
  102. # Split apart by commas
  103. tables = m.group(1).split(',')
  104. for table in tables:
  105. # Take the first part (which is table name)
  106. out_tables.append(string.strip(table.split()[0]))
  107. return out_tables
  108. m = LLQuery.sSelectRE.match(query)
  109. if m:
  110. out_tables.append(string.strip(m.group(1)))
  111. return out_tables
  112. m = LLQuery.sUpdateRE.match(query)
  113. if m:
  114. # Split apart by commas
  115. tables = m.group(1).split(',')
  116. for table in tables:
  117. # Take the first part (which is table name)
  118. out_tables.append(string.strip(table.split()[0]))
  119. return out_tables
  120. m = LLQuery.sReplaceRE.match(query)
  121. if m:
  122. out_tables.append(string.strip(m.group(1)))
  123. return out_tables
  124. m = LLQuery.sInsertRE.match(query)
  125. if m:
  126. out_tables.append(string.strip(m.group(1)))
  127. return out_tables
  128. m = LLQuery.sDeleteRE.match(query)
  129. if m:
  130. out_tables.append(string.strip(m.group(1)))
  131. return out_tables
  132. return out_tables
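# --- Illustrative sketch, not part of the original module ---
# Roughly what get_query_tables() returns for a few hand-written queries; the
# table names and queries below are invented purely for demonstration.
def _example_get_query_tables():
    # Single-table SELECT with a WHERE clause -> ['agents']
    print get_query_tables("SELECT * FROM agents WHERE agent_id = *uuid*")
    # Comma-joined SELECT -> ['agents', 'groups']
    print get_query_tables("SELECT a.name FROM agents a, groups g WHERE a.id = g.id")
    # UPDATE and DELETE are matched by their own regular expressions
    print get_query_tables("UPDATE agents SET name = *string* WHERE id = *num*")
    print get_query_tables("DELETE FROM money_transactions WHERE id = *num*")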
  133. MIN_BIN=-15
  134. MAX_BIN=10
  135. class LLQuery:
  136. "Represents all of the data associated with a query"
  137. fromLLSDStats = staticmethod(llquery_from_llsd)
  138. def __init__(self, host, port, query, start_time):
  139. # Store information which will be serialized for metadata in a map
  140. self.mData = {}
  141. self.mData['host'] = host
  142. self.mData['port'] = port
  143. self.mData['query'] = query
  144. # Metadata
  145. self.mData['host_clean'] = None
  146. self.mData['host_full'] = None
  147. self.mData['query_clean'] = None
  148. self.mData['tables'] = []
  149. #
  150. # Stats information
  151. #
  152. self.mNumQueries = 0
  153. self.mTotalTime = 0.0
  154. self.mOutQueries = 0
  155. self.mTotalTimeCorrected = 0.0 # Corrected to remove outliers
  156. self.mNumQueriesCorrected = 0 # Corrected to remove outliers
  157. # LLQueryStatBins for the query time histogram, as well as corrected time
  158. # Query times are collected into bins based on power of 2 execution times (in seconds).
  159. # Each bin collects the number of queries and total execution time. See LLQueryStatBin
  160. # for more details
  161. self.mBins = {} # Bins for histogram
  162. # This stuff doesn't usually get serialized
  163. self.mQueryLen = len(query)
  164. self.mStartTime = start_time
  165. self.mResponseTime = start_time
  166. def __hash__(self):
  167. return (self.mData['host_clean'] + ":" + self.mData['query_clean']).__hash__()
  168. def __eq__(self, other):
  169. # Note, this matches on clean, not strictly correct
  170. if ((self.mData['query_clean'] == other.mData['query_clean']) and
  171. (self.mData['host_clean'] == other.mData['host_clean'])):
  172. return True
  173. return False
  174. def getKey(self):
  175. # The string key is just the clean host and query, concatenated
  176. return self.mData['host_clean'] + ":" + self.mData['query_clean']
  177. def clean(self):
  178. "Generate the clean query so it can be used for statistics"
  179. if not self.mData['host_clean']:
  180. (self.mData['host_clean'], self.mData['host_full']) = get_host_type(self.mData['host'])
  181. self.mData['query_clean'] = clean_query(self.mData['query'], 0)
  182. def getAvgTimeCorrected(self):
  183. "Average time per query, corrected for outliers"
  184. return self.mTotalTimeCorrected/self.mNumQueriesCorrected
  185. def queryStart(self):
  186. "When collecting query stats, use this when the query is receieved"
  187. self.mNumQueries += 1
  188. self.mOutQueries += 1
  189. def queryResponse(self, elapsed):
  190. "When collecting stats, use this when the response is received"
  191. self.mTotalTime += elapsed
  192. self.mOutQueries -=1
  193. # Determine which stat bin this query is in
  194. bin = MIN_BIN
  195. if elapsed:
  196. bin = int(math.log(elapsed,2))
  197. bin = max(MIN_BIN, bin)
  198. bin = min(MAX_BIN, bin)
  199. if bin not in self.mBins:
  200. self.mBins[bin] = LLQueryStatBin(bin)
  201. self.mBins[bin].accumulate(elapsed)
  202. def correctOutliers(self):
  203. "Find outliers bins and calculate corrected results"
  204. # Outlier bins have query counts which are 3 orders of magnitude less than the total count for that query
  205. if not self.mNumQueries:
  206. # FIXME: This is a hack because we don't save this information in the query count dump
  207. return
  208. min_queries = self.mNumQueries/100
  209. self.mTotalTimeCorrected = 0.0
  210. self.mNumQueriesCorrected = 0
  211. for i in self.mBins.keys():
  212. if self.mBins[i].mNumQueries < min_queries:
  213. # Outlier, flag as such.
  214. self.mBins[i].mOutlier = True
  215. else:
  216. self.mTotalTimeCorrected += self.mBins[i].mTotalTime
  217. self.mNumQueriesCorrected += self.mBins[i].mNumQueries
  218. if self.mNumQueriesCorrected == 0:
  219. #HACK: Deal with divide by zero
  220. self.mNumQueriesCorrected = 1
  221. # Miscellaneous regular expressions to analyze the query type
  222. sReadRE = re.compile("(SELECT.*)|(USE.*)", re.IGNORECASE)
  223. sSelectWhereRE = re.compile("\(?\s*?SELECT.+?FROM\s+\(?(.*?)\)?\s+WHERE.*", re.IGNORECASE)
  224. sSelectRE = re.compile("\(?\s*?SELECT.+?FROM\s+(.+)(?:\s+LIMIT.*|.*)", re.IGNORECASE)
  225. sUpdateRE = re.compile("UPDATE\s+(.+?)\s+SET.*", re.IGNORECASE)
  226. sReplaceRE = re.compile("REPLACE INTO\s+(.+?)(?:\s*\(|\s+SET).*", re.IGNORECASE)
  227. sInsertRE = re.compile("INSERT.+?INTO\s+(.+?)(?:\s*\(|\s+SET).*", re.IGNORECASE)
  228. sDeleteRE = re.compile("DELETE.+?FROM\s+(.+?)\s+WHERE.*", re.IGNORECASE)
  229. def analyze(self):
  230. "Does some query analysis on the query"
  231. query = self.mData['query_clean']
  232. self.mData['tables'] = get_query_tables(query)
  233. if 'type' in self.mData:
  234. # Already analyzed
  235. return
  236. if LLQuery.sReadRE.match(query):
  237. self.mData['type'] = 'read'
  238. else:
  239. self.mData['type'] = 'write'
  240. def dumpLine(self, elapsed, query_len = 0):
  241. "Dump a semi-human-readable stats line for reporting"
  242. bin_str = ''
  243. for i in range(MIN_BIN,MAX_BIN+1):
  244. if i in self.mBins:
  245. if self.mBins[i].mOutlier:
  246. bin_str += '*'
  247. else:
  248. bin_str += str(int(math.log10(self.mBins[i].mNumQueries)))
  249. else:
  250. bin_str += '.'
  251. if not query_len:
  252. query_len = 4096
  253. num_queries = self.mNumQueriesCorrected
  254. if not num_queries:
  255. num_queries = 1
  256. return ("%s\t%5d\t%6.2f\t%6.2f\t%1.4f\t%s\t" % (bin_str, num_queries,
  257. num_queries/elapsed, self.mTotalTimeCorrected,
  258. self.mTotalTimeCorrected/num_queries, self.mData['host_clean'])) \
  259. + self.mData['query_clean'][0:query_len]
  260. def as_map(self):
  261. "Make an LLSD map version of data that can be used for merging"
  262. self.analyze()
  263. self.mData['num_queries'] = self.mNumQueries
  264. self.mData['total_time'] = self.mTotalTime
  265. self.mData['num_queries_corrected'] = self.mNumQueriesCorrected
  266. self.mData['total_time_corrected'] = self.mTotalTimeCorrected
  267. return self.mData
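# --- Illustrative sketch, not part of the original module ---
# Typical accounting flow for one LLQuery: execution times accumulate into
# power-of-2 bins (int(log2(elapsed)), clamped to [MIN_BIN, MAX_BIN]), and
# correctOutliers() later drops sparsely populated bins before averaging.
# The host, port, query and timings below are invented for demonstration.
def _example_llquery_accounting():
    q = LLQuery("10.0.0.1", 40000, "SELECT * FROM agents WHERE id = 1", 0.0)
    q.clean()                     # fills in host_clean and query_clean
    for elapsed in (0.002, 0.003, 0.004, 1.5):
        q.queryStart()            # call when the query is seen on the wire
        q.queryResponse(elapsed)  # call when the first response packet arrives
    q.correctOutliers()           # with only 4 samples min_queries is 0, so nothing is flagged here
    print q.getKey(), q.getAvgTimeCorrected()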
  268. class LLConnStatus:
  269. "Keeps track of the status of a connection talking to mysql"
  270. def __init__(self, ip_port, start_time):
  271. self.mLastMysqlPacketNumber = 0
  272. self.mNumPackets = 0
  273. self.mIPPort = ip_port
  274. self.mStartTime = start_time
  275. self.mLastUpdate = start_time
  276. self.mCurState = ""
  277. self.mLastQuery = None
  278. self.mNumQueries = 0
  279. def quit(self, src_ip, src_port, pkt_time):
  280. query = LLQuery(src_ip, src_port, "Quit", pkt_time)
  281. query.clean()
  282. self.mLastUpdate = pkt_time
  283. self.mLastQuery = query
  284. self.mNumPackets += 1
  285. def queryStart(self, src_ip, src_port, pkt_time, raw, pkt_len, offset):
  286. query_len = pkt_len - 1
  287. query = LLQuery(src_ip, src_port, raw[offset:offset + (pkt_len - 1)], pkt_time)
  288. self.mLastUpdate = pkt_time
  289. # Packet length includes the command, offset into raw doesn't
  290. if query_len > (len(raw) - offset):
  291. query.mQueryLen = query_len
  292. self.mCurState = "SendingQuery"
  293. else:
  294. self.mCurState = "QuerySent"
  295. query.clean()
  296. self.mNumQueries += 1
  297. self.mLastQuery = query
  298. self.mNumPackets += 1
  299. def queryStartProcessed(self, src_ip, src_port, pkt_time, query_str):
  300. query = LLQuery(src_ip, src_port, query_str, pkt_time)
  301. query.clean()
  302. self.mLastUpdate = pkt_time
  303. self.mCurState = "QuerySent"
  304. self.mNumQueries += 1
  305. self.mLastQuery = query
  306. self.mNumPackets += 1
  307. def updateNonCommand(self, pkt_time, raw):
  308. # Clean up an existing query if you get a non-command.
  309. self.mNumPackets += 1
  310. self.mLastUpdate = pkt_time
  311. if self.mLastQuery:
  312. if self.mCurState == "SendingQuery":
  313. # We're continuing a query
  314. # We won't generate a new clean version, because it'll $!@# up all the sorting.
  315. self.mLastQuery.mData['query'] += raw
  316. if len(self.mLastQuery.mData['query']) == self.mLastQuery.mQueryLen:
  317. self.mCurState = "QuerySent"
  318. self.mLastQuery.clean()
  319. return
  320. else:
  321. #
  322. # A non-command that's continuing a query. Not sure why this is happening,
  323. # but clear the last query to avoid generating inadvertent long query results.
  324. #
  325. self.mLastQuery = None
  326. # Default to setting state to "NonCommand"
  327. self.mCurState = "NonCommand"
  328. def updateResponse(self, pkt_time, result_type):
  329. # If we've got a query running, accumulate the elapsed time
  330. start_query_response = False
  331. if self.mCurState == "QuerySent":
  332. lq = self.mLastQuery
  333. if lq:
  334. if lq.mStartTime == 0.0:
  335. lq.mStartTime = pkt_time
  336. lq.mResponseTime = pkt_time
  337. start_query_response = True
  338. self.mLastUpdate = pkt_time
  339. if result_type == 0:
  340. self.mCurState = "Result:RecvOK"
  341. elif result_type == 0xff:
  342. self.mCurState = "Result:Error"
  343. elif result_type == 0xfe:
  344. self.mCurState = "Result:EOF"
  345. elif result_type == 0x01:
  346. self.mCurState = "Result:Header"
  347. else:
  348. self.mCurState = "Result:Data"
  349. return start_query_response
  350. def dump(self):
  351. if self.mLastQuery:
  352. print "%s: NumQ: %d State:%s\n\tLast: %s" % (self.mIPPort, self.mNumQueries, self.mCurState,
  353. self.mLastQuery.mData['query_clean'][0:40])
  354. else:
  355. print "%s: NumQ: %d State:%s\n\tLast: None" % (self.mIPPort, self.mNumQueries, self.mCurState)
  356. class LLQueryStatBin:
  357. "Keeps track of statistics for one query bin"
  358. def __init__(self, power):
  359. self.mMinTime = pow(2, power)
  360. self.mMaxTime = pow(2, power+1)
  361. self.mTotalTime = 0
  362. self.mNumQueries = 0
  363. self.mOutlier = False
  364. def accumulate(self, elapsed):
  365. self.mTotalTime += elapsed
  366. self.mNumQueries += 1
  367. def dump_query_stat_header():
  368. return "LogHistogram (-15:10) \tCount\tQPS\tTotal\tAvg\tHost\tQuery"
  369. class LLQueryStatMap:
  370. def __init__(self, description, start_time):
  371. self.mDescription = description
  372. self.mQueryMap = {}
  373. self.mStartTime = start_time
  374. self.mFinalTime = 0
  375. self.mLastTime = self.mStartTime
  376. self.mQueryStartCount = 0
  377. self.mQueryResponseCount = 0
  378. def load(self, fn):
  379. "Load dumped query stats from an LLSD file"
  380. # Read in metadata
  381. in_file = open(fn)
  382. in_string = in_file.read()
  383. in_file.close()
  384. in_llsd = llsd.LLSD.parse(in_string)
  385. info = in_llsd[0]
  386. query_list = in_llsd[1]
  387. self.mDescription = info['description']
  388. self.mStartTime = info['start_time']
  389. self.mLastTime = info['last_time']
  390. self.mFinalTime = info['last_time']
  391. self.mQueryStartCount = info['query_start_count']
  392. self.mQueryResponseCount = info['query_response_count']
  393. # Iterate through all the queries, and populate the query map.
  394. for query_row in query_list:
  395. query = LLQuery.fromLLSDStats(query_row)
  396. self.mQueryMap[query.getKey()] = query
  397. def analyze(self):
  398. for query in self.mQueryMap.values():
  399. query.analyze()
  400. def queryStart(self, query):
  401. if not query in self.mQueryMap:
  402. #query.analyze()
  403. self.mQueryMap[query] = query
  404. self.mQueryMap[query].queryStart()
  405. # Update elapsed time for this map
  406. self.mLastTime = query.mStartTime
  407. if self.mLastTime < self.mStartTime:
  408. self.mStartTime = self.mLastTime
  409. if self.mLastTime > self.mFinalTime:
  410. self.mFinalTime = self.mLastTime
  411. self.mQueryStartCount += 1
  412. def queryResponse(self, query):
  413. if not query in self.mQueryMap:
  414. self.queryStart(query)
  415. elapsed = query.mResponseTime - query.mStartTime
  416. self.mQueryMap[query].queryResponse(elapsed)
  417. self.mLastTime = query.mResponseTime
  418. if self.mLastTime > self.mFinalTime:
  419. self.mFinalTime = self.mLastTime
  420. self.mQueryResponseCount += 1
  421. def getElapsedTime(self):
  422. return self.mFinalTime - self.mStartTime
  423. def getQPS(self):
  424. return self.mQueryStartCount / self.getElapsedTime()
  425. def correctOutliers(self):
  426. for query in self.mQueryMap.values():
  427. query.correctOutliers()
  428. def getSortedKeys(self, sort_by = "total_time"):
  429. "Gets a list of keys sorted by sort type"
  430. self.correctOutliers()
  431. items = self.mQueryMap.items()
  432. backitems = None
  433. if sort_by == "total_time":
  434. backitems = [[v[1].mTotalTimeCorrected, v[0]] for v in items]
  435. elif sort_by == "count":
  436. backitems = [[v[1].mNumQueriesCorrected, v[0]] for v in items]
  437. elif sort_by == "avg_time":
  438. backitems = [[v[1].getAvgTimeCorrected(), v[0]] for v in items]
  439. else:
  440. # Fallback, sort by total time
  441. backitems = [[v[1].mTotalTimeCorrected, v[0]] for v in items]
  442. backitems.sort()
  443. backitems.reverse()
  444. # Get the keys out of the items
  445. sorted = []
  446. for pair in backitems:
  447. sorted.append(pair[1])
  448. return sorted
  449. def getSortedStats(self, sort_by = "total_time", num_stats = 0):
  450. "Gets a list of the top queries according to sort type"
  451. sorted_keys = self.getSortedKeys(sort_by)
  452. if num_stats == 0:
  453. l = len(sorted_keys)
  454. else:
  455. l = min(num_stats, len(sorted_keys))
  456. stats = []
  457. for i in range(0, l):
  458. stats.append(self.mQueryMap[sorted_keys[i]])
  459. return stats
  460. def dumpStatus(self, sort_type = "total_time", elapsed = None):
  461. # Dump status according to total time
  462. if not elapsed:
  463. elapsed = self.getElapsedTime()
  464. sorted_stats = self.getSortedStats(sort_type)
  465. for query in sorted_stats:
  466. print query.dumpLine(elapsed, 60)
  467. def dumpLLSD(self, filename):
  468. # Analyze queries to generate metadata
  469. self.analyze()
  470. # Dump an LLSD document representing the entire object
  471. out = []
  472. # First, dump all the metadata into the first block
  473. info_map = {}
  474. info_map['description'] = self.mDescription
  475. info_map['start_time'] = self.mStartTime
  476. info_map['last_time'] = self.mLastTime
  477. info_map['query_start_count'] = self.mQueryStartCount
  478. info_map['query_response_count'] = self.mQueryResponseCount
  479. out.append(info_map)
  480. # Dump all of the query info into the second block
  481. sorted_stats = self.getSortedStats("total_time")
  482. query_list = []
  483. for query in sorted_stats:
  484. query_list.append(query.as_map())
  485. out.append(query_list)
  486. f = open(filename, "w")
  487. f.write(str(llsd.LLSD(out)))
  488. f.close()
  489. def dumpTiming(self, filename):
  490. cur_time = time.time()
  491. f = open(filename, "w")
  492. f.write(dump_query_stat_header() + "\n")
  493. # Sort the queries
  494. sorted_stats = self.getSortedStats("total_time")
  495. for query in sorted_stats:
  496. f.write(query.dumpLine(cur_time - self.mStartTime))
  497. f.write("\n")
  498. f.close()
  499. def dumpCountsLLSD(self, filename):
  500. "Dump the query statistics as an LLSD doc, for later merging with the query_info doc"
  501. out = []
  502. # Put the metadata into a map
  503. info_map = {}
  504. info_map['description'] = self.mDescription
  505. info_map['start_time'] = self.mStartTime
  506. info_map['last_time'] = self.mLastTime
  507. info_map['query_start_count'] = self.mQueryStartCount
  508. info_map['query_response_count'] = self.mQueryResponseCount
  509. out.append(info_map)
  510. sorted_stats = self.getSortedStats("total_time")
  511. query_list = []
  512. for query in sorted_stats:
  513. query_row = {}
  514. # We only want to dump identifying info and stats, not metadata
  515. query_row['host_clean'] = query.mData['host_clean']
  516. # Convert the queries to utf-8 to make sure it doesn't break XML
  517. try:
  518. u = unicode(query.mData['query_clean'])
  519. query_row['query_clean'] = u.encode('utf-8')
  520. except:
  521. query_row['query_clean'] = 'NON-UTF8'
  522. try:
  523. u = unicode(query.mData['query'])
  524. query_row['query'] = u.encode('utf-8')
  525. except:
  526. query_row['query'] = 'NON-UTF8'
  527. query_row['count'] = query.mNumQueriesCorrected
  528. query_row['total_time'] = query.mTotalTimeCorrected
  529. query_row['avg_time'] = query.getAvgTimeCorrected()
  530. query_list.append(query_row)
  531. out.append(query_list)
  532. f = open(filename, "w")
  533. f.write(str(llsd.LLSD(out)))
  534. f.close()
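# --- Illustrative sketch, not part of the original module ---
# How LLQueryStatMap is typically driven: LLQuery events from a stream are
# folded into the map, then sorted and dumped. The output file name is
# hypothetical.
def _example_stat_map(queries):
    stats = LLQueryStatMap("example", time.time())
    for q in queries:              # each q is an LLQuery with start/response times set
        stats.queryStart(q)
        stats.queryResponse(q)
    print dump_query_stat_header()
    stats.dumpStatus("total_time")        # top queries by corrected total time
    stats.dumpLLSD("query_dump.llsd")     # hypothetical output path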
  535. class LLBinnedQueryStats:
  536. "Keeps track of a fixed number of N minute bins of query stats"
  537. def __init__(self):
  538. self.mHourBins = {} # This will be keyed by unixtime seconds, eventually
  539. self.mMinuteBins = {}
  540. self.mLastUpdateHour = 0
  541. self.mLastUpdateMinute = 0
  542. def dumpTiming(self, path):
  543. # Dump hour bins
  544. for (key, value) in self.mHourBins.items():
  545. value.dumpTiming("%s/hour-%s-query_timing.txt" % (path, key))
  546. # Dump minute bins
  547. for (key, value) in self.mMinuteBins.items():
  548. value.dumpTiming("%s/minute-%s-query_timing.txt" % (path, key))
  549. def dumpCountsLLSD(self, path):
  550. # Dump hour bins
  551. for (key, value) in self.mHourBins.items():
  552. value.dumpCountsLLSD("%s/hour-%s-query_counts.llsd" % (path, key))
  553. # Dump minute bins
  554. for (key, value) in self.mMinuteBins.items():
  555. value.dumpCountsLLSD("%s/minute-%s-query_counts.llsd" % (path, key))
  556. def dumpLLSD(self, path):
  557. # Dump hour bins
  558. for (key, value) in self.mHourBins.items():
  559. value.dumpLLSD("%s/hour-%s-query_dump.llsd" % (path, key))
  560. # Dump minute bins
  561. for (key, value) in self.mMinuteBins.items():
  562. value.dumpLLSD("%s/minute-%s-query_dump.llsd" % (path, key))
  563. def flushOldBins(self, time_secs):
  564. for minute_bin_str in self.mMinuteBins.keys():
  565. bin_secs = time.mktime(time.strptime(minute_bin_str, "%Y-%m-%d-%H-%M"))
  566. if (time_secs - bin_secs) > 3*3600:
  567. del self.mMinuteBins[minute_bin_str]
  568. def queryStart(self, query):
  569. "Update associated bin for the time specified, creating if necessary"
  570. # Hour and minute bins
  571. t = time.localtime(query.mStartTime)
  572. hour_bin_str = time.strftime("%Y-%m-%d-%H", t)
  573. minute_bin_str = time.strftime("%Y-%m-%d-%H-%M", t)
  574. hour = t[3]
  575. minute = t[4]
  576. # FIXME: These start times are a bit inaccurate, but should be fine under heavy query load.
  577. if not hour_bin_str in self.mHourBins:
  578. self.mHourBins[hour_bin_str] = LLQueryStatMap(hour_bin_str, query.mStartTime)
  579. if not minute_bin_str in self.mMinuteBins:
  580. self.mMinuteBins[minute_bin_str] = LLQueryStatMap(minute_bin_str, query.mStartTime)
  581. self.mHourBins[hour_bin_str].queryStart(query)
  582. self.mMinuteBins[minute_bin_str].queryStart(query)
  583. if hour != self.mLastUpdateHour:
  584. self.mLastUpdateHour = hour
  585. # If the hour changes, dump and clean out old bins
  586. self.flushOldBins(query.mStartTime)
  587. def queryResponse(self, query):
  588. "Update associated bin for the time specified, creating if necessary"
  589. # Hour and minute bins
  590. t = time.localtime(query.mStartTime)
  591. hour_bin_str = time.strftime("%Y-%m-%d-%H", t)
  592. minute_bin_str = time.strftime("%Y-%m-%d-%H-%M", t)
  593. hour = t[3]
  594. minute = t[4]
  595. # FIXME: These start times are a bit inaccurate, but should be fine under heavy query load.
  596. if not hour_bin_str in self.mHourBins:
  597. self.mHourBins[hour_bin_str] = LLQueryStatMap(hour_bin_str, query.mStartTime)
  598. if not minute_bin_str in self.mMinuteBins:
  599. self.mMinuteBins[minute_bin_str] = LLQueryStatMap(minute_bin_str, query.mStartTime)
  600. self.mHourBins[hour_bin_str].queryResponse(query)
  601. self.mMinuteBins[minute_bin_str].queryResponse(query)
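# --- Illustrative sketch, not part of the original module ---
# LLBinnedQueryStats keys LLQueryStatMap instances by "%Y-%m-%d-%H" and
# "%Y-%m-%d-%H-%M" strings, so each dump call writes one file per hour and
# per minute bin. The output directory below is hypothetical.
def _example_binned_stats(queries, path="/tmp/query_stats"):
    binned = LLBinnedQueryStats()
    for q in queries:
        binned.queryStart(q)
        binned.queryResponse(q)
    binned.dumpCountsLLSD(path)    # writes hour-*-query_counts.llsd and minute-*-query_counts.llsd files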
  602. # MySQL protocol sniffer, using tcpdump, ncap packet parsing and mysql internals
  603. # http://forge.mysql.com/wiki/MySQL_Internals_ClientServer_Protocol
  604. class LLQueryStream:
  605. "Process a raw tcpdump stream (in raw libpcap format)"
  606. def __init__(self, in_file):
  607. self.mInFile = in_file
  608. self.mStartTime = time.time()
  609. #
  610. # A list of all outstanding "connections", and what they're doing.
  611. # This is necessary in order to get script timing and other information.
  612. #
  613. self.mConnStatus = {}
  614. self.mConnKeys = []
  615. self.mConnCleanupIndex = 0
  616. #
  617. # Parse/skip past the libpcap global header
  618. #
  619. #guint32 magic_number; /* magic number */
  620. #guint16 version_major; /* major version number */
  621. #guint16 version_minor; /* minor version number */
  622. #gint32 thiszone; /* GMT to local correction */
  623. #guint32 sigfigs; /* accuracy of timestamps */
  624. #guint32 snaplen; /* max length of captured packets, in octets */
  625. #guint32 network; /* data link type */
  626. # Skip past the libpcap global header
  627. format = 'IHHiIII'
  628. size = struct.calcsize(format)
  629. header_bin = self.mInFile.read(size)
  630. res = struct.unpack(format, header_bin)
  631. def createConnection(self, client_ip_port, pkt_time):
  632. # Track the connection, create a new one or return existing
  633. if not client_ip_port in self.mConnStatus:
  634. self.mConnStatus[client_ip_port] = LLConnStatus(client_ip_port, pkt_time)
  635. # Track a new key that we need to garbage collect
  636. self.mConnKeys.append(client_ip_port)
  637. conn = self.mConnStatus[client_ip_port]
  638. return conn
  639. def closeConnection(self, ip_port):
  640. if ip_port in self.mConnStatus:
  641. del self.mConnStatus[ip_port]
  642. def cleanupConnection(self,cur_time):
  643. # Cleanup some number of stale connections.
  644. CONNECTION_EXPIRY=900.0
  645. if self.mConnCleanupIndex >= len(self.mConnKeys):
  646. self.mConnCleanupIndex = 0
  647. # Skip if no keys
  648. if len(self.mConnKeys) == 0:
  649. return
  650. key = self.mConnKeys[self.mConnCleanupIndex]
  651. if key in self.mConnStatus:
  652. # Clean up if it's too old
  653. if self.mConnStatus[key].mLastUpdate < (cur_time - CONNECTION_EXPIRY):
  654. del self.mConnStatus[key]
  655. #print "Cleaning up old key:", key
  656. #print "num conns:", len(self.mConnStatus)
  657. #print "num keys", len(self.mConnKeys)
  658. else:
  659. # Clean up if the connection is already removed
  660. del self.mConnKeys[self.mConnCleanupIndex]
  661. self.mConnCleanupIndex += 1
  662. def getNextEvent(self):
  663. # Get the next event out of the packet stream
  664. td_format = 'IIII'
  665. ip_format = '!BBHHHBBHII'
  666. tcp_format = '!HHIIBBHHH'
  667. while 1:
  668. #
  669. # Parse out an individual packet from the tcpdump stream
  670. #
  671. # Match the packet header
  672. # Pull a record (packet) off of the wire
  673. # Packet header
  674. # guint32 ts_sec; /* timestamp seconds */
  675. # guint32 ts_usec; /* timestamp microseconds */
  676. # guint32 incl_len; /* number of octets of packet saved in file */
  677. # guint32 orig_len; /* actual length of packet */
  678. ph_bin = self.mInFile.read(16)
  679. res = struct.unpack(td_format, ph_bin)
  680. ts_sec = res[0]
  681. ts_usec = res[1]
  682. pkt_time = ts_sec + (ts_usec/1000000.0)
  683. incl_len = res[2]
  684. orig_len = res[3]
  685. # Packet data (incl_len bytes)
  686. raw_data = self.mInFile.read(incl_len)
  687. # Parse out the MAC header
  688. # Don't bother, we don't care - 14 byte header
  689. mac_offset = 14
  690. # Parse out the IP header (min 20 bytes)
  691. # 4 bits - version
  692. # 4 bits - header length in 32 bit words
  693. # 1 byte - type of service
  694. # 2 bytes - total length
  695. # 2 bytes - fragment identification
  696. # 3 bits - flags
  697. # 13 bits - fragment offset
  698. # 1 byte - TTL
  699. # 1 byte - Protocol (should be 6)
  700. # 2 bytes - header checksum
  701. # 4 bytes - source IP
  702. # 4 bytes - dest IP
  703. ip_header = struct.unpack(ip_format, raw_data[mac_offset:mac_offset + 20])
  704. # Assume all packets are TCP
  705. #if ip_header[6] != 6:
  706. # print "Not TCP!"
  707. # continue
  708. src_ip_bin = ip_header[8]
  709. src_ip = lookup_ip_string(src_ip_bin)
  710. #src_ip = "%d.%d.%d.%d" % ((src_ip_bin & 0xff000000L) >> 24,
  711. # (src_ip_bin & 0x00ff0000L) >> 16,
  712. # (src_ip_bin & 0x0000ff00L) >> 8,
  713. # src_ip_bin & 0x000000ffL)
  714. dst_ip_bin = ip_header[9]
  715. dst_ip = lookup_ip_string(dst_ip_bin)
  716. #dst_ip = "%d.%d.%d.%d" % ((dst_ip_bin & 0xff000000L) >> 24,
  717. # (dst_ip_bin & 0x00ff0000L) >> 16,
  718. # (dst_ip_bin & 0x0000ff00L) >> 8,
  719. # dst_ip_bin & 0x000000ffL)
  720. ip_size = (ip_header[0] & 0x0f) * 4
  721. # Parse out the TCP packet header
  722. # 2 bytes - src_prt
  723. # 2 bytes - dst_port
  724. # 4 bytes - sequence number
  725. # 4 bytes - ack number
  726. # 4 bits - data offset (size in 32 bit words of header
  727. # 6 bits - reserved
  728. # 6 bits - control bits
  729. # 2 bytes - window
  730. # 2 bytes - checksum
  731. # 2 bytes - urgent pointer
  732. tcp_offset = mac_offset + ip_size
  733. tcp_header = struct.unpack(tcp_format, raw_data[tcp_offset:tcp_offset+20])
  734. tcp_size = ((tcp_header[4] & 0xf0) >> 4) * 4
  735. src_port = tcp_header[0]
  736. dst_port = tcp_header[1]
  737. # 3 bytes - packet length
  738. # 1 byte - packet number
  739. # 1 byte - command
  740. # <n bytes> - args
  741. pkt_offset = tcp_offset + tcp_size
  742. if len(raw_data) == pkt_offset:
  743. continue
  744. # Clearly not a mysql packet if it's less than 5 bytes of data
  745. if len(raw_data) - pkt_offset < 5:
  746. continue
  747. src_ip_port = "%s:%d" % (src_ip, src_port)
  748. dst_ip_port = "%s:%d" % (dst_ip, dst_port)
  749. if src_port == 3306:
  750. #
  751. # We are processing traffic from mysql server -> client
  752. # This primarily is used to time how long it takes for use
  753. # to start receiving data to the client from the server.
  754. #
  755. mysql_arr = array.array('B', raw_data[pkt_offset])
  756. result_type = ord(raw_data[pkt_offset])
  757. # Get or create connection
  758. conn = self.createConnection(dst_ip_port, pkt_time)
  759. # Update the status of this connection, including query times on
  760. # connections
  761. if conn.updateResponse(pkt_time, result_type):
  762. # Event: Initial query response
  763. return "QueryResponse", conn.mLastQuery
  764. continue
  765. if dst_port == 3306:
  766. #
  767. # Processing a packet from the client to the server
  768. #
  769. # HACK! This is an easy place to put this where we can get packet time that only happens once or so per event.
  770. # Garbage collect connections
  771. self.cleanupConnection(pkt_time)
  772. # Pull out packet length from the header
  773. mysql_arr = array.array('B', raw_data[pkt_offset:pkt_offset+5])
  774. pkt_len = mysql_arr[0] + (long(mysql_arr[1]) << 8) + (long(mysql_arr[2]) << 16)
  775. pkt_number = mysql_arr[3]
  776. # Find the connection associated with this packet
  777. # Get or create connection
  778. conn = self.createConnection(src_ip_port, pkt_time)
  779. #if conn.mLastMysqlPacketNumber != (pkt_number - 1):
  780. # print "Prev:", conn.mLastMysqlPacketNumber, "Cur:", pkt_number
  781. conn.mLastMysqlPacketNumber = pkt_number
  782. cmd = mysql_arr[4]
  783. # If we're not a command, do stuff
  784. if cmd > 0x1c:
  785. # Unfortunately, we can't trivially tell the difference between
  786. # various non-command packets
  787. # Assume that these are all AuthResponses for now.
  788. conn.updateNonCommand(pkt_time, raw_data[pkt_offset:])
  789. if "QuerySent" == conn.mCurState:
  790. return ("QueryStart", conn.mLastQuery)
  791. continue
  792. query = None
  793. if cmd == 1:
  794. # Event: Quitting a connection
  795. conn.quit(src_ip, src_port, pkt_time)
  796. # This connection is closing, get rid of it
  797. self.closeConnection(src_ip_port)
  798. return ("Quit", conn.mLastQuery)
  799. elif cmd == 3:
  800. # Event: Starting a query
  801. conn.queryStart(src_ip, src_port, pkt_time, raw_data, pkt_len, pkt_offset + 5)
  802. # Only return an QueryStart if we have the whole query
  803. if "QuerySent" == conn.mCurState:
  804. return ("QueryStart", conn.mLastQuery)
  805. else:
  806. pass
  807. IP_PORT_RE = re.compile("(\S+):(\d+)")
  808. EVENT_RE = re.compile("(\S+)\t(\S+):(\d+)\t(\S+)\t(\S+)")
  809. SECTION_RE = re.compile("\*{38}")
  810. class LLLogQueryStream:
  811. "Process a query stream dump to generate a query stream class"
  812. "Process a raw tcpdump stream (in raw libpcap format)"
  813. def __init__(self, lineiter):
  814. self.mLineIter = lineiter
  815. self.mStartTime = None
  816. #
  817. # A list of all outstanding "connections", and what they're doing.
  818. # This is necessary in order to get script timing and other information.
  819. #
  820. self.mConnStatus = {}
  821. def closeConnection(self, ip_port):
  822. if ip_port in self.mConnStatus:
  823. del self.mConnStatus[ip_port]
  def createConnection(self, client_ip_port, pkt_time):
  # Bug fix: getNextEvent() below calls self.createConnection(), which this class
  # never defined; mirror LLQueryStream.createConnection (minus the key-expiry list).
  if not client_ip_port in self.mConnStatus:
  self.mConnStatus[client_ip_port] = LLConnStatus(client_ip_port, pkt_time)
  return self.mConnStatus[client_ip_port]
  824. def getNextEvent(self):
  825. # Get the next event out of the file
  826. cur_event = None
  827. event_time = None
  828. event_type = None
  829. ip = None
  830. port = None
  831. ip_port = None
  832. cur_state = 'Metadata'
  833. for line in self.mLineIter:
  834. if line == '':
  835. return (None, None)
  836. if cur_state == 'Metadata':
  837. # We're looking for an event. Actually we better find one.
  838. m = EVENT_RE.match(line)
  839. if not m:
  840. #raise "Missing event on line: %s" % line
  841. continue
  842. else:
  843. event_time = float(m.group(1))
  844. ip = m.group(2)
  845. port = int(m.group(3))
  846. ip_port = m.group(2)+":"+m.group(3)
  847. clean_host = m.group(4)
  848. event_type = m.group(5)
  849. query_str = ''
  850. cur_state = 'Query'
  851. elif cur_state == 'Query':
  852. if not SECTION_RE.match(line):
  853. query_str += line
  854. else:
  855. # We're done
  856. # Generate the event to return
  857. # Track the connection if we don't know about it yet.
  858. conn = self.createConnection(ip_port, event_time)
  859. if event_type == 'QueryStart':
  860. conn.queryStartProcessed(ip, port, event_time, query_str)
  861. return ("QueryStart", conn.mLastQuery)
  862. elif event_type == 'QueryResponse':
  863. # Update the status of this connection, including query times on
  864. # connections
  865. # Hack: Result type defaults to zero
  866. if conn.updateResponse(event_time, 0):
  867. # Event: Initial query response
  868. return ("QueryResponse", conn.mLastQuery)
  869. else:
  870. # Skip responses which we don't have the start for
  871. cur_state = 'Metadata'
  872. elif event_type == 'Quit':
  873. # Event: Quitting a connection
  874. conn.quit(ip, port, event_time)
  875. # This connection is closing, get rid of it
  876. self.closeConnection(ip_port)
  877. return ("Quit", conn.mLastQuery)
  878. else:
  879. raise ("Unknown event type %s" % event_type)
  880. return (None, None)
  881. def start_dump(host, port):
  882. # Start up tcpdump pushing data into netcat on the sql server
  883. interface = "eth0"
  884. # Start up tcpdump pushing data into netcat on the sql server
  885. SRC_DUMP_CMD = "ssh root@%s '/usr/sbin/tcpdump -p -n -s 0 -w - -i %s dst port 3306 or src port 3306 | nc %s %d'" \
  886. % (host, interface, socket.getfqdn(), port)
  887. os.popen2(SRC_DUMP_CMD, "r")
  888. def remote_mysql_stream(host):
  889. # Create a server socket, then have tcpdump dump stuff to it.
  890. serversocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
  891. bound = False
  892. port = 9999
  893. while not bound:
  894. try:
  895. serversocket.bind((socket.gethostname(), port))
  896. bound = True
  897. except:
  898. print port, " already bound, trying again"
  899. port += 1
  900. print "Bound port %d" % port
  901. serversocket.listen(1)
  902. # Fork off the dumper, start the server on the main connection
  903. pid = os.fork()
  904. if not pid:
  905. # Child process which gets data from the database
  906. time.sleep(1.0)
  907. print "Starting dump!"
  908. start_dump(host, port)
  909. print "Exiting dump!"
  910. sys.exit(0)
  911. print "Starting server"
  912. (clientsocket, address) = serversocket.accept()
  913. print "Accepted connection", address
  914. # Start listening to the data stream
  915. return clientsocket.makefile("rb")
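# --- Illustrative sketch, not part of the original module ---
# Live sniffing end to end: remote_mysql_stream() forks the tcpdump-over-ssh
# pipeline and returns a file object, which LLQueryStream parses into
# (event, LLQuery) pairs. The database host name is hypothetical.
def _example_live_sniff():
    stream = LLQueryStream(remote_mysql_stream("db.example.com"))
    stats = LLQueryStatMap("live", time.time())
    while 1:   # runs until the capture ends (struct.error on EOF) or is interrupted
        (event, query) = stream.getNextEvent()
        if event == "QueryStart":
            stats.queryStart(query)
        elif event == "QueryResponse":
            stats.queryResponse(query)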
  916. #
  917. # Utility stuff for query cleaner
  918. #
  919. # This is a Python port of (part of) the fingerprint() function from
  920. # the mk-query-digest script in Maatkit, added by Yoz, with various additions/tweaks
  921. hex_wildcard = r"[0-9a-f]"
  922. word = hex_wildcard + r"{4}-"
  923. long_word = hex_wildcard + r"{8}-"
  924. very_long_word = hex_wildcard + r"{12}"
  925. UUID_REGEX_STRING = long_word + word + word + word + very_long_word
  926. hex_re = re.compile("^[\da-f]+$",re.I)
  927. uuid_re = re.compile("^"+UUID_REGEX_STRING+"$",re.I)
  928. def string_replace(match):
  929. "Called by string-matching regexp in replacers"
  930. if uuid_re.match(match.group(1)):
  931. return "*uuid*"
  932. return "*string*"
  933. # list of (match,replacement) tuples used by clean_query()
  934. replacers = [
  935. # Disabling comment removal because we may put useful inspection info in there
  936. #(re.compile(r'(?:--|#)[^\'"\r\n]*(?=[\r\n]|\Z)',re.I),""), # one-line comments
  937. #(re.compile(r"/\*[^!].*?\*/",re.I|re.M|re.S),""), # But not /*!version */
  938. (re.compile(r"\\\\"),""), # remove backslash pairs that may confuse the next line
  939. (re.compile(r"\\[\"']"),""), # remove escaped quotes
  940. (re.compile(r'"([^"]*)"',re.I),string_replace), # quoted strings
  941. (re.compile(r"'([^']*)'",re.I),string_replace), # quoted strings
  942. # this next one may need more work, due to "UPDATE ... SET money = money-23"
  943. # the next two are significantly different from the maatkit original code
  944. (re.compile(r"(?<![\w\)\d])(\s*)\-\d+(\.\d+)?",re.I),"*num*"), # negative reals
  945. (re.compile(r"(?<![\w])\d+(\.\d+)?",re.I),"*num*"), # positive reals
  946. # mk-query-digest has s/[xb.+-]\?/?/g; as "clean up leftovers" here, whatever that means - I've left it out
  947. (re.compile(r"^\s+",re.I),""), # chop off leading whitespace
  948. (re.compile(r"\s+$",re.I|re.M|re.S),""), # kill trailing whitespace
  949. # reduce IN and VALUES lists (look for previously-cleaned placeholders)
  950. (re.compile(r"\b(in|values)(?:[\s,]*\(([\s\,]*\*(num|string|uuid)\*)*[\s,]*\))+",
  951. re.I|re.X),"\\1(*values*)"), # collapse IN and VALUES lists
  952. # This next one collapses chains of UNIONed functionally-identical queries,
  953. # but it's only really useful if you're regularly seeing more than 2 queries
  954. # in a chain. We don't seem to have any like that, so I'm disabling this.
  955. #(re.compile(r"\b(select\s.*?)(?:(\sunion(?:\sall)?)\s\1)+",re.I),"\\1 -- repeat\\2 --"), # collapse UNION
  956. # remove "OFFSET *num*" when following a LIMIT
  957. (re.compile(r"\blimit \*num\*(?:, ?\*num\*| offset \*num\*)?",re.I),"LIMIT *num*")
  958. ]
  959. prepare_re = re.compile('PREPARE.*', re.IGNORECASE)
  960. deallocate_re = re.compile('DEALLOCATE\s+PREPARE.*', re.IGNORECASE)
  961. execute_re = re.compile('EXECUTE.*', re.IGNORECASE)
  962. mdb_re = re.compile('MDB2_STATEMENT\S+')
  963. def clean_query(query, num_words):
  964. "Generalizes a query by removing all unique information"
  965. # Strip carriage returns
  966. query = query.replace("\n", " ")
  967. # Screw it, if it's a prepared statement or an execute, generalize the statement name
  968. if prepare_re.match(query):
  969. query = mdb_re.sub('*statement*', query)
  970. return query
  971. if execute_re.match(query):
  972. query = mdb_re.sub('*statement*', query)
  973. if deallocate_re.match(query):
  974. query = "DEALLOCATE PREPARE"
  975. return query
  976. # Loop through the replacers and perform each one
  977. for (replacer, subst) in replacers:
  978. # try block is here because, apparently, string_replace may throw an exception
  979. # TODO: investigate the above
  980. try:
  981. query = replacer.sub(subst, query)
  982. except:
  983. pass
  984. # After we do the cleanup, then we get rid of extra whitespace
  985. words = query.split(None)
  986. query = " ".join(words)
  987. return query
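# --- Illustrative sketch, not part of the original module ---
# What the fingerprinting above does in practice: literals collapse to
# *num*/*string*/*uuid* placeholders and IN/VALUES lists collapse to
# (*values*), so structurally identical queries share one clean string.
# The sample queries are invented for demonstration.
def _example_clean_query():
    print clean_query("SELECT name FROM agents WHERE id = 42", 0)
    # -> SELECT name FROM agents WHERE id = *num*
    print clean_query("SELECT * FROM inventory WHERE owner IN ('a', 'b', 'c')", 0)
    # -> SELECT * FROM inventory WHERE owner IN(*values*)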
  988. def test_clean_query(query):
  989. "A debug version of the query cleaner which prints steps as it goes"
  990. # Strip carriage returns
  991. query = query.replace("\n", " ")
  992. # Screw it, if it's a prepared statement or an execute, generalize the statement name
  993. if prepare_re.match(query):
  994. query = mdb_re.sub('*statement*', query)
  995. return query
  996. if execute_re.match(query):
  997. query = mdb_re.sub('*statement*', query)
  998. if deallocate_re.match(query):
  999. query = "DEALLOCATE PREPARE"
  1000. return query
  1001. # Loop through the replacers and perform each one
  1002. for (replacer, subst) in replacers:
  1003. try:
  1004. if replacer.search(query) == None:
  1005. print replacer.pattern," : No match"
  1006. else:
  1007. query = replacer.sub(subst, query)
  1008. print replacer.pattern," : ",query
  1009. except:
  1010. pass
  1011. # After we do the cleanup, then we get rid of extra whitespace
  1012. words = query.split(None)
  1013. query = " ".join(words)
  1014. return query
  1015. #
  1016. # Hostname cache - basically, caches the "linden" host type for a particular IP address
  1017. # or hostname
  1018. #
  1019. sim_re = re.compile(".*sim\d+.*")
  1020. web_re = re.compile("int\.web\d+.*")
  1021. iweb_re = re.compile("int\.iweb\d+.*")
  1022. webds_re = re.compile(".*web-ds\d+.*")
  1023. webster_re = re.compile(".*webster\d+.*")
  1024. bankds_re = re.compile(".*bank-ds\d+.*")
  1025. xmlrpc_re = re.compile(".*xmlrpc\d+.*")
  1026. login_re = re.compile(".*login\d+.*")
  1027. data_re = re.compile(".*data\..*")
  1028. #xmlrpc_re = re.compile("(?:int\.omgiwanna.*)|(?:int\.pony.*)")
  1029. ip_re = re.compile("\d+\.\d+\.\d+\.\d+")
  1030. ll_re = re.compile("(.*)\.lindenlab\.com")
  1031. host_type_cache = {}
  1032. def get_host_type(host):
  1033. "Returns the genericized linden host type from an IP address or hostname"
  1034. # if host in host_type_cache:
  1035. # return host_type_cache[host]
  1036. named_host = str(host)
  1037. if ip_re.match(host):
  1038. # Look up the hostname
  1039. try:
  1040. named_host = str(socket.gethostbyaddr(host)[0])
  1041. except:
  1042. pass
  1043. # Figure out generic host type
  1044. host_type = named_host
  1045. if sim_re.match(named_host):
  1046. host_type = "sim"
  1047. elif login_re.match(named_host):
  1048. host_type = "login"
  1049. elif webster_re.match(named_host):
  1050. host_type = "webster"
  1051. elif bankds_re.match(named_host):
  1052. host_type = "bank-ds"
  1053. elif web_re.match(named_host):
  1054. host_type = "web"
  1055. elif iweb_re.match(named_host):
  1056. host_type = "iweb"
  1057. elif webds_re.match(named_host):
  1058. host_type = "web-ds"
  1059. elif data_re.match(named_host):
  1060. host_type = "data"
  1061. elif xmlrpc_re.match(named_host):
  1062. host_type = "xmlrpc"
  1063. m = ll_re.match(host_type)
  1064. if m:
  1065. host_type = m.group(1)
  1066. host_type_cache[host] = host_type
  1067. return (host_type, named_host)
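# --- Illustrative sketch, not part of the original module ---
# get_host_type() collapses individual hosts into a generic role: anything
# matching sim_re reports as "sim", login hosts as "login", and so on, with a
# trailing .lindenlab.com stripped from unrecognized names. The hostname below
# is hypothetical.
def _example_get_host_type():
    print get_host_type("sim1234.agni.lindenlab.com")   # -> ("sim", "sim1234.agni.lindenlab.com")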
  1068. def LLLogIter(filenames):
  1069. "An iterator that iterates line by line over a series of files, even if they're compressed."
  1070. for f in filenames:
  1071. curr = open_log_file(f)
  1072. for line in curr:
  1073. yield line
  1074. def open_log_file(filename):
  1075. # Open the logfile (even if it's compressed)
  1076. if re.compile(".+\.gz").match(filename):
  1077. # gzipped file, return a gzipped file object
  1078. return gzip.open(filename,"r")
  1079. else:
  1080. return open(filename, "r")
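# --- Illustrative sketch, not part of the original module ---
# Replaying previously captured query logs: LLLogIter transparently reads plain
# and gzipped files, and LLLogQueryStream turns the text dump back into
# (event, LLQuery) pairs. The log file names below are hypothetical.
def _example_replay_logs():
    lines = LLLogIter(["query_info.log", "query_info.log.1.gz"])
    stream = LLLogQueryStream(lines)
    stats = LLQueryStatMap("replay", time.time())
    while 1:
        (event, query) = stream.getNextEvent()
        if event is None:
            break
        if event == "QueryStart":
            stats.queryStart(query)
        elif event == "QueryResponse":
            stats.queryResponse(query)
    stats.dumpStatus("total_time")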