/python/engine/PinYin/tools/gphrase.py
Python | 64 lines | 54 code | 3 blank | 7 comment | 2 complexity | 2c27aebfe3bae04817cd8ec2f3a86925 MD5 | raw file
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- import urllib2
- import urllib
- import re
- import sys
- import time
- import traceback
- #from urlgrabber.keepalive import HTTPHandler
# Google web-search URL template.  The single %s placeholder receives the
# already URL-encoded query string (for example "q=%22phrase%22").
SEARCH_URL = "http://www.google.com/search?hl=en&lr=&as_qdr=all&%s&btnG=Search"

# Captures the hit count from result-page markup of the form
#   <b>1,060</b> for <b>allintext:
# The count may contain thousands separators, which callers strip.
# Written as a raw string so the backslash reaches the regex engine unchanged
# instead of relying on Python passing through an unknown string escape.
RE = re.compile(r"<b>([0-9\,]+)</b> for <b>")
- #keepalive_handler = HTTPHandler ()
- #opener = urllib2.build_opener(keepalive_handler)
- #urllib2.install_opener(opener)
def get_search_result(keyword):
    """Return the hit count Google reports for an exact-phrase search.

    The keyword is wrapped in double quotes, URL-encoded into SEARCH_URL and
    requested with a browser-like User-agent header.  The response is read
    line by line until a line matches RE.

    Args:
        keyword: the phrase to search for, as a byte string.

    Returns:
        The first matched count as an int with thousands separators removed,
        or 0 if no line of the response matches RE.

    Raises:
        Any error raised by urllib2.urlopen or while reading the response
        propagates to the caller; the response is closed first.
    """
    params = urllib.urlencode({"q": "\"%s\"" % keyword})
    url = SEARCH_URL % params
    req = urllib2.Request(url)
    req.add_header('User-agent', 'Mozilla/5.0')
    req.add_header('Host', 'www.google.com')
    f = urllib2.urlopen(req)
    # Close the response on every exit path (early return, fall-through or
    # exception) so the underlying socket is not leaked.
    try:
        for l in f:
            # Echo each raw response line to stdout, as the original did.
            # NOTE(review): this looks like leftover debugging output --
            # confirm nobody relies on it before removing.
            print(l)
            m = RE.search(l)
            if m:
                return int(m.group(1).replace(",", ""))
        return 0
    finally:
        f.close()
def process_phrase_file(name):
    """Look up the search hit count of every phrase in a UTF-8 text file.

    Reads one phrase per line from ``name`` (surrounding whitespace is
    stripped), queries get_search_result for each distinct phrase, and writes
    ``name + ".out"`` containing one "phrase<TAB>count" line per input line,
    encoded as UTF-8 and in the original order.

    Failed lookups are reported on stderr and retried on the next pass; the
    function keeps making passes until every phrase has a count.

    Args:
        name: path of the input phrase file.
    """
    with open(name) as source:
        phrases = [unicode(raw.strip(), "utf8") for raw in source]

    phrases_dict = {}
    # Compare against the number of *distinct* phrases: the dict can never
    # hold more entries than that, so comparing against len(phrases) would
    # loop forever as soon as the file contains a duplicate line.
    total = len(set(phrases))
    # NOTE(review): a phrase whose lookup always fails is retried forever
    # with no delay between passes -- kept as in the original.
    while len(phrases_dict) != total:
        for p in phrases:
            if p in phrases_dict:
                continue
            try:
                phrases_dict[p] = get_search_result(p.encode("utf8"))
            except Exception:
                # Exception rather than a bare except, so Ctrl-C and
                # SystemExit can still break out of the retry loop.
                traceback.print_exc()
                line = u"Search %s failed" % p
                # Encode explicitly: writing a non-ASCII unicode object to a
                # stderr without an encoding would raise inside the handler.
                sys.stderr.write(line.encode("utf8") + "\n")
            else:
                # Progress as an integer percentage of distinct phrases done.
                print("%d%%" % (len(phrases_dict) * 100 // total))

    with open(name + ".out", "w") as output:
        for p in phrases:
            line = u"%s\t%d" % (p, phrases_dict[p])
            output.write(line.encode("utf8") + "\n")
-
if __name__ == "__main__":
    # Treat every command-line argument as a phrase and print the value
    # get_search_result returns for it, one per line.
    keywords = sys.argv[1:]
    for keyword in keywords:
        hits = get_search_result(keyword)
        print(hits)
    # Disabled alternative: treat the arguments as phrase files instead.
    #for filename in sys.argv[1:]:
    #    process_phrase_file (filename)