PageRenderTime 33ms CodeModel.GetById 15ms RepoModel.GetById 0ms app.codeStats 0ms

/tools/counter/counter.py

https://github.com/ChuguluGames/mediawiki-svn
Python | 127 lines | 87 code | 16 blank | 24 comment | 19 complexity | 8582344eae7211895410ef0e21720b44 MD5 | raw file
  1. #!/usr/bin/python
  2. #Page view counter
  3. # Reads squid logs (https://wikitech.leuksman.com/view/Squid_log_format)
  4. # Normalizes page name, aggregates them for a configurable time window, shoves the
  5. # aggregates into a database.
  6. # Usage: ./counter.py [list of allowed pages] < logfile
  7. # Be sure sampleHits is set correctly
  8. #Notes:
  9. # * Requires pyjudy (http://www.dalkescientific.com/Python/PyJudy.html)
  10. # (python dicts and sets use too much darn memory)
  11. # * The final incomplete aggregation window is discarded.
  12. # * Fixed aggregation windows that align to time of day may be more useful than the current
  13. # behavior.
  14. import MySQLdb
  15. import re
  16. import sys
  17. import urllib
  18. import time
  19. import pyjudy
sampleHits = 100 # Number of hits to record per sample
aggThresh = 3600 # Number of sample seconds needed to trigger a data export
# Lazily-opened shared MySQL connection; managed by getConnection()/closeConnection().
globalConnection = None
# In-memory aggregation window: maps "host:Page name" -> sampled hit count.
aggCounter = pyjudy.JudySLInt()
# (earliest, latest) timestamp seen in the current window; starts "empty"
# with an impossible range so the first hit cannot trigger a flush.
aggRange = (sys.maxint,0)
  25. def runLoop(inputFile, targetPages=None):
  26. for line in inputFile:
  27. # Skip lines that are just going to be hitting the upload server
  28. # or common skin files
  29. if line.find(" GET http://upload.wikimedia.org/") == -1 \
  30. and line.find(".org/skins-1.5/") == -1:
  31. page,timestamp = extractPage(line)
  32. if page and (targetPages == None or page in targetPages):
  33. recordHit(page,timestamp)
  34. closeConnection()
  35. def extractPage(line):
  36. # Extract the page name from the URL.
  37. # A check should probably be placed here to toss requests with
  38. # page names larger than the maximum length.
  39. url,timestamp = extractUrl(line)
  40. if url and \
  41. "?" not in url and \
  42. url[0:7] == "http://":
  43. bits = url[7:].split("/", 2)
  44. if len(bits) == 3 and bits[1] == "wiki":
  45. host = bits[0]
  46. page = normalizePage(bits[2])
  47. return (host + ":" + page, timestamp)
  48. return None
  49. def extractUrl(line):
  50. # https://wikitech.leuksman.com/view/Squid_log_format
  51. # $hostname %sn %ts.%03tu %tr %>a %Ss/%03Hs %<st %rm %ru %Sh/%<A %mt %{Referer}>h %{X-Forwarded-For}>h %{User-Agent}>h
  52. # ...
  53. # 3. Seconds (and milliseconds) since epoch
  54. # ...
  55. # 9. URL
  56. bits = line.split(" ", 10)
  57. if len(bits) > 9 and bits[8] == "GET":
  58. return (bits[9],int(round(float(bits[3]))))
  59. else:
  60. return None
  61. def normalizePage(page):
  62. return urllib.unquote(page).replace("_", " ")
def recordHit(page,timestamp):
    # Count one sampled hit for `page` at `timestamp` in the in-memory
    # aggregation window, flushing the window to the database first if
    # this hit would stretch it to aggThresh seconds or more.
    global aggCounter
    global aggRange
    global aggThresh
    # Flush check: compare the would-be window end against its start.
    # NOTE(review): the final incomplete window is never exported (see
    # the file header) -- closeConnection() does not trigger a flush.
    if (max(timestamp,aggRange[1])-aggRange[0] >= aggThresh):
        for item in aggCounter.items():
            # Keys are "host:Page name"; split only on the first colon so
            # page names containing colons stay intact.
            (site, pagename) = item[0].split(":", 1)
            conn = getConnection()
            conn.cursor().execute(
                "INSERT INTO hit_counter (hc_tsstart, hc_tsend, hc_site, hc_page, hc_count) VALUES (%s, %s, %s, %s, %s)",
                (time.strftime("%Y-%m-%d %H:%M:%S",time.gmtime(aggRange[0])),time.strftime("%Y-%m-%d %H:%M:%S",time.gmtime(aggRange[1])),site, pagename, item[1]))
        conn.commit()
        # Start the next window where the flushed one ended, and drop the
        # aggregated counts.
        aggRange=(aggRange[1],aggRange[1])
        aggCounter.FreeArray()
    # Each logged line is a 1-in-sampleHits sample, so record it as that
    # many hits.
    if page in aggCounter:
        aggCounter[page] += sampleHits
    else:
        aggCounter[page] = sampleHits
    # Widen the window bounds to cover this hit's timestamp.
    aggRange=(min(timestamp,aggRange[0]),max(timestamp,aggRange[1]))
  82. def getConnection():
  83. global globalConnection
  84. if not globalConnection:
  85. globalConnection = openConnection()
  86. return globalConnection
  87. def openConnection():
  88. return MySQLdb.connect(host="localhost", user="root", passwd="", db="counter")
  89. def closeConnection():
  90. global globalConnection
  91. if globalConnection:
  92. globalConnection.close()
  93. globalConnection = None
  94. def setFromFile(filename):
  95. """Read list of lines from a file"""
  96. infile = open(filename)
  97. out = pyjudy.JudySLInt()
  98. for line in infile:
  99. if line.strip()!="":
  100. out.Ins(line.strip(),1)
  101. infile.close()
  102. return out
  103. if __name__ == "__main__":
  104. if len(sys.argv) > 1:
  105. targetPages = setFromFile(sys.argv[1])
  106. runLoop(sys.stdin, targetPages)
  107. else:
  108. runLoop(sys.stdin)