PageRenderTime 7ms CodeModel.GetById 1ms app.highlight 4ms RepoModel.GetById 1ms app.codeStats 0ms

/python/engine/PinYin/tools/gphrase.py

http://scim-python.googlecode.com/
Python | 64 lines | 54 code | 3 blank | 7 comment | 3 complexity | 2c27aebfe3bae04817cd8ec2f3a86925 MD5 | raw file
 1#!/usr/bin/env python
 2# -*- coding: utf-8 -*-
 3import urllib2
 4import urllib
 5import re
 6import sys
 7import time
 8import traceback
 9#from urlgrabber.keepalive import HTTPHandler
10
# Google search URL template; %s is replaced by the URL-encoded query
# parameters built in get_search_result.
SEARCH_URL = "http://www.google.com/search?hl=en&lr=&as_qdr=all&%s&btnG=Search"
# Captures the hit count from the result page, which looks like:
#   <b>1,060</b> for <b>allintext:
# Raw string so the pattern holds no string-level escape sequences; a comma
# needs no escaping inside a character class.
RE = re.compile (r"<b>([0-9,]+)</b> for <b>")
14
15#keepalive_handler = HTTPHandler ()
16#opener = urllib2.build_opener(keepalive_handler)
17#urllib2.install_opener(opener)
18
def get_search_result (keyword):
	"""Return the hit count Google reports for an exact-phrase search.

	keyword is wrapped in double quotes, URL-encoded as the "q" parameter
	and substituted into SEARCH_URL.  The page is requested with a
	'Mozilla/5.0' User-agent header and scanned line by line; the first
	match of RE is stripped of its "," separators and returned as an int.
	Returns 0 when no line of the response matches.

	Exceptions raised while opening or reading the URL are not caught here
	and propagate to the caller.
	"""
	params = urllib.urlencode ({"q": "\"%s\"" % keyword})
	req = urllib2.Request (SEARCH_URL % params)
	req.add_header ('User-agent', 'Mozilla/5.0')
	req.add_header ('Host', 'www.google.com')
	response = urllib2.urlopen (req)
	try:
		# The raw HTML is no longer echoed to stdout: that debug output got
		# mixed into the counts and progress lines printed by the callers.
		for line in response:
			match = RE.search (line)
			if match:
				return int (match.group (1).replace (",", ""))
	finally:
		# Release the connection on every path, including the early return.
		response.close ()
	return 0
33
def process_phrase_file (name):
	"""Look up the search hit count of every phrase listed in a file.

	name is the path of a UTF-8 text file holding one phrase per line;
	surrounding whitespace is stripped from each line.  Every distinct
	phrase is passed (UTF-8 encoded) to get_search_result.  A lookup that
	raises is reported on stderr and retried on the next pass, so the
	function only returns once every phrase has a count.  Progress is
	printed to stdout as a percentage of the distinct phrases.

	The results are written to name + ".out" as UTF-8 lines of the form
	"phrase<TAB>count", one per input line and in input order.
	"""
	source = open (name)
	try:
		phrases = [unicode (l.strip (), "utf8") for l in source]
	finally:
		source.close ()

	# Completion is measured against the distinct phrases: phrases_dict can
	# never grow to len (phrases) when a line is repeated, which used to keep
	# the retry loop spinning forever.
	total = len (set (phrases))
	phrases_dict = {}
	while len (phrases_dict) != total:
		for p in phrases:
			if p in phrases_dict:
				continue
			try:
				phrases_dict[p] = get_search_result (p.encode ("utf8"))
			except Exception:
				# Not a bare except, so KeyboardInterrupt and SystemExit can
				# still break out of the retry loop.
				traceback.print_exc ()
				line = u"Search %s failed" % p
				# Encode explicitly: writing unicode to a stderr without a
				# suitable encoding would raise inside this handler.
				sys.stderr.write (line.encode ("utf8") + "\n")
			else:
				print ("%d%%" % (len (phrases_dict) * 100 // total))

	output = open (name + ".out", "w")
	try:
		for p in phrases:
			line = u"%s\t%d" % (p, phrases_dict[p])
			output.write (line.encode ("utf8") + "\n")
	finally:
		# Make sure the results are flushed to disk and the handle released.
		output.close ()
58		
59
if __name__ == "__main__":
	# Command-line use: print the hit count of each keyword given as an
	# argument, one count per line.
	args = sys.argv[1:]
	for keyword in args:
		print (get_search_result (keyword))
	# Disabled batch mode: treat the arguments as phrase files instead.
	#for filename in sys.argv[1:]:
	#	process_phrase_file (filename)