counter.py | searchcode

/tools/counter/counter.py

https://github.com/ChuguluGames/mediawiki-svn
Python | 127 lines | 87 code | 16 blank | 24 comment | 19 complexity | 8582344eae7211895410ef0e21720b44 MD5 | raw file

#!/usr/bin/python

#Page view counter
#	Reads squid logs (https://wikitech.leuksman.com/view/Squid_log_format)
#	Normalizes page name, aggregates them for a configurable time window, shoves the 
#	aggregates into a database.
# Usage: ./counter.py [list of allowed pages] < logfile
# Be sure sampleHits is set correctly: every log line recorded adds sampleHits to that page's count.

#Notes:
# * Requires pyjudy (http://www.dalkescientific.com/Python/PyJudy.html)
#   (python dicts and sets use too much darn memory)
# * The final incomplete aggregation window is discarded.
# * Fixed aggregation windows that align to time of day may be more useful than the current
#   behavior.

import MySQLdb
import re
import sys
import urllib
import time
import pyjudy

# Amount added to a page's counter for every log line that is recorded
# (see recordHit).
sampleHits = 100    # Number of hits to record per sample
# Once the timestamps seen in the current window span at least this many
# seconds, recordHit exports the window to the database.
aggThresh = 3600  # Number of sample seconds needed to trigger a data export

# Shared database connection; opened on first use by getConnection() and
# reset to None by closeConnection().
globalConnection = None
# Hit counts for the current aggregation window, keyed by "host:page".
aggCounter = pyjudy.JudySLInt()
# (earliest, latest) timestamp recorded in the current window. The initial
# (sys.maxint, 0) lets the first hit set both ends through min()/max().
aggRange = (sys.maxint,0)

def runLoop(inputFile, targetPages=None):
	"""Record a hit for every countable page request found in inputFile.

	inputFile   -- iterable of log lines (sys.stdin when run as a script)
	targetPages -- optional container of "host:page" keys; when given, only
	               pages found in it are recorded. None records every page.

	The shared database connection is closed when the input is exhausted or
	an error interrupts the loop. Hits still held in the in-memory counter at
	that point are not exported.
	"""
	try:
		for line in inputFile:
			# Skip lines that are just going to be hitting the upload server
			# or common skin files
			if (" GET http://upload.wikimedia.org/" in line
					or ".org/skins-1.5/" in line):
				continue
			# extractPage() may report "nothing to count" as None; unpacking
			# that directly raises TypeError and would abort the whole run.
			result = extractPage(line)
			if not result:
				continue
			page, timestamp = result
			if page and (targetPages is None or page in targetPages):
				recordHit(page, timestamp)
	finally:
		closeConnection()

def extractPage(line):
	"""Extract the page key and timestamp from one log line.

	Returns ("host:page name", timestamp) when the requested URL starts with
	http://, has no query string and looks like http://host/wiki/page.
	Returns (None, None) for every other line, so callers can always unpack
	two values.
	"""
	# A check should probably be placed here to toss requests with
	# page names larger than the maximum length.
	# extractUrl() may report "no URL" as None; unpacking that directly
	# raises TypeError, so substitute an empty pair first.
	url, timestamp = extractUrl(line) or (None, None)
	if not url or "?" in url or url[0:7] != "http://":
		return (None, None)
	# Split "host/wiki/page" into at most three parts; the page name keeps
	# any further slashes it contains.
	bits = url[7:].split("/", 2)
	if len(bits) == 3 and bits[1] == "wiki":
		host = bits[0]
		page = normalizePage(bits[2])
		return (host + ":" + page, timestamp)
	return (None, None)

def extractUrl(line):
	"""Pull the requested URL and the request time out of one log line.

	Returns (url, timestamp) for GET requests, where timestamp is the
	fractional epoch field rounded to whole seconds. Returns (None, None)
	for non-GET lines, lines with too few fields, or lines whose timestamp
	field is not a usable number, so callers can always unpack two values.
	"""
	# https://wikitech.leuksman.com/view/Squid_log_format
	# $hostname %sn %ts.%03tu %tr %>a %Ss/%03Hs %<st %rm %ru %Sh/%<A %mt %{Referer}>h %{X-Forwarded-For}>h %{User-Agent}>h
	# ...
	# 3. Seconds (and milliseconds) since epoch
	# ...
	# 9. URL
	# NOTE(review): the indices used below are 0-based, so bits[3], bits[8]
	# and bits[9] are the 4th, 9th and 10th space-separated fields, while the
	# list above numbers the timestamp as 3 and the URL as 9. Confirm against
	# the actual lines fed to this script before changing either.
	bits = line.split(" ", 10)
	if len(bits) > 9 and bits[8] == "GET":
		try:
			return (bits[9], int(round(float(bits[3]))))
		except (ValueError, OverflowError):
			# Malformed timestamp: skip this line rather than abort the run
			pass
	return (None, None)

def normalizePage(page):
	"""Return the page name with %-escapes decoded and underscores as spaces."""
	decoded = urllib.unquote(page)
	return decoded.replace("_", " ")

def recordHit(page,timestamp):
	"""Count one recorded log line for page, exporting the window if full.

	page      -- counter key of the form "host:page name"
	timestamp -- request time in whole seconds since the epoch

	When the timestamps seen so far (including this one) span at least
	aggThresh seconds, every counter in the current window is written to the
	hit_counter table, the counters are cleared and a new window starts at
	the old window's end. The hit is then added to the (possibly new) window
	with a weight of sampleHits.
	"""
	global aggRange

	if max(timestamp, aggRange[1]) - aggRange[0] >= aggThresh:
		# The window bounds are identical for every exported row, so format
		# them once instead of once per row.
		timeFormat = "%Y-%m-%d %H:%M:%S"
		windowStart = time.strftime(timeFormat, time.gmtime(aggRange[0]))
		windowEnd = time.strftime(timeFormat, time.gmtime(aggRange[1]))
		conn = getConnection()
		cursor = conn.cursor()
		try:
			for entry in aggCounter.items():
				# Keys are "host:page"; the page part may itself contain ":"
				(site, pagename) = entry[0].split(":", 1)
				cursor.execute(
					"INSERT INTO hit_counter (hc_tsstart, hc_tsend, hc_site, hc_page, hc_count) VALUES (%s, %s, %s, %s, %s)",
					(windowStart, windowEnd, site, pagename, entry[1]))
		finally:
			cursor.close()
		# One commit per exported window rather than one per row
		conn.commit()
		aggRange = (aggRange[1], aggRange[1])
		aggCounter.FreeArray()

	if page in aggCounter:
		aggCounter[page] += sampleHits
	else:
		aggCounter[page] = sampleHits
	# Widen the window to include this hit
	aggRange = (min(timestamp, aggRange[0]), max(timestamp, aggRange[1]))
	
	

def getConnection():
	"""Return the shared database connection, opening it on first use."""
	global globalConnection
	if globalConnection:
		return globalConnection
	# Nothing cached (or it was closed): open one and remember it
	globalConnection = openConnection()
	return globalConnection

def openConnection(host="localhost", user="root", passwd="", db="counter"):
	"""Open and return a new MySQL connection.

	host, user, passwd, db -- connection settings passed straight to
	MySQLdb.connect(). The defaults are the values this script has always
	used, so calling openConnection() with no arguments is unchanged.
	"""
	# NOTE(review): the defaults connect as root with an empty password;
	# consider a dedicated, restricted account for anything but local use.
	return MySQLdb.connect(host=host, user=user, passwd=passwd, db=db)

def closeConnection():
	"""Close the shared database connection, if one is open, and forget it."""
	global globalConnection
	if not globalConnection:
		# Never opened, or already closed
		return
	globalConnection.close()
	globalConnection = None

def setFromFile(filename):
	"""Load the non-blank lines of a file into a JudySLInt used as a set.

	filename -- path of a text file with one entry per line

	Each line is stripped of surrounding whitespace; lines that are empty
	after stripping are ignored. Every remaining entry is inserted with the
	value 1, so the result can be used for membership tests.
	"""
	out = pyjudy.JudySLInt()
	infile = open(filename)
	try:
		for line in infile:
			entry = line.strip()
			if entry != "":
				out.Ins(entry, 1)
	finally:
		# Close the file even if reading or inserting fails
		infile.close()
	return out

if __name__ == "__main__":
	# Optional first argument: a file listing the pages to count.
	if len(sys.argv) <= 1:
		# No page list given: count every page read from stdin
		runLoop(sys.stdin)
	else:
		targetPages = setFromFile(sys.argv[1])
		runLoop(sys.stdin, targetPages)