gphrase.py - This Python script retrieves search result counts for a given keyword from Google

/python/engine/PinYin/tools/gphrase.py

http://scim-python.googlecode.com/ · Python · 64 lines · 47 code · 8 blank · 9 comment · 10 complexity · 2c27aebfe3bae04817cd8ec2f3a86925 MD5 · raw file


#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib2
import urllib
import re
import sys
import time
import traceback
#from urlgrabber.keepalive import HTTPHandler

# Google search URL template; %s is replaced by the url-encoded query parameters.
SEARCH_URL = "http://www.google.com/search?hl=en&lr=&as_qdr=all&%s&btnG=Search"
# Captures the result count from markup such as: <b>1,060</b> for <b>allintext:
# Raw string, and no backslash before the comma: inside a character class the
# comma is already literal, and "\," is not a valid string escape.
RE = re.compile (r"<b>([0-9,]+)</b> for <b>")

#keepalive_handler = HTTPHandler ()
#opener = urllib2.build_opener(keepalive_handler)
#urllib2.install_opener(opener)

def get_search_result (keyword):
	"""Return the result count Google reports for an exact-phrase search.

	keyword is wrapped in double quotes, url-encoded and substituted into
	SEARCH_URL.  The callers in this file pass byte strings (command-line
	arguments, or phrases already encoded as UTF-8).

	The response is scanned line by line; the first line matching RE
	supplies the count, with thousands separators removed.  Returns 0 when
	no line matches.  Errors raised while opening or reading the URL are
	not caught here and propagate to the caller.
	"""
	params = urllib.urlencode ({"q": "\"%s\"" % keyword})
	url = SEARCH_URL % params
	req = urllib2.Request (url)
	req.add_header ('User-agent', 'Mozilla/5.0')
	req.add_header ('Host', 'www.google.com')
	f = urllib2.urlopen (req)
	try:
		for l in f:
			# Only the first count on the page is wanted.
			m = RE.search (l)
			if m:
				return int (m.group (1).replace (",", ""))
	finally:
		# Release the connection on every exit path, including the
		# early return above and read errors.
		f.close ()
	return 0

def process_phrase_file (name):
	phrases = []
	phrases_dict = {}
	for l in open (name):
		phrases.append (unicode (l.strip(), "utf8"))
	i = 0
	while len (phrases) != len (phrases_dict):
		for p in phrases:
			if p in phrases_dict:
				continue
			try:
				phrases_dict[p] = get_search_result (p.encode ("utf8"))
				i += 1
				print "%d%%" % (i * 100 / len (phrases))
			except:
				traceback.print_exc ()
				line = u"Search %s failed" % p
				print >> sys.stderr, line
	
	output = file (name + ".out", "w")

	for p in phrases:
		line = u"%s\t%d" % (p, phrases_dict[p])
		print >>output, line.encode ("utf8")
		

if __name__ == "__main__":
	for keyword in sys.argv[1:]:
		print get_search_result (keyword)
	#for filename in sys.argv[1:]:
	#	process_phrase_file (filename)

Summary ✨

This Python script queries Google for an exact-phrase match of a keyword and extracts the reported number of results from the response page. When run directly, it takes one or more keywords as command-line arguments and prints the result count for each one to standard output. It also defines a batch helper, `process_phrase_file`, which reads phrases from a file, looks up each one, and writes every phrase with its count to an output file named after the input with `.out` appended; the call to this batch mode is commented out in the main block.

Tech Fingerprint

Alerts (5)

'def' Ensure functions have docstrings for documentation
34
Complexity hotspot; lines 40 to 42 (total complexity: 3)
40 41 42
'except:' Avoid catching all exceptions; specify exception types to catch only expected errors
48