PageRenderTime 28ms CodeModel.GetById 13ms RepoModel.GetById 0ms app.codeStats 0ms

/python/engine/PinYin/tools/gphrase.py

http://scim-python.googlecode.com/
Python | 64 lines | 54 code | 3 blank | 7 comment | 2 complexity | 2c27aebfe3bae04817cd8ec2f3a86925 MD5 | raw file
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. import urllib2
  4. import urllib
  5. import re
  6. import sys
  7. import time
  8. import traceback
  9. #from urlgrabber.keepalive import HTTPHandler
  # Google search URL template; the %s slot receives an already URL-encoded
  # query string (get_search_result fills it with urllib.urlencode output).
  SEARCH_URL = "http://www.google.com/search?hl=en&lr=&as_qdr=all&%s&btnG=Search"
  # Matches the result-count markup in the response and captures the count,
  # thousands separators included, e.g. "<b>1,060</b> for <b>allintext:".
  # Raw string so the backslash reaches the regex engine without relying on
  # Python passing an unknown string escape through unchanged.
  RE = re.compile(r"<b>([0-9\,]+)</b> for <b>")
  13. #keepalive_handler = HTTPHandler ()
  14. #opener = urllib2.build_opener(keepalive_handler)
  15. #urllib2.install_opener(opener)
  def get_search_result(keyword):
      """Return the hit count Google reports for the exact phrase *keyword*.

      The phrase is wrapped in double quotes, URL-encoded as the ``q``
      parameter of SEARCH_URL and requested with the User-agent
      'Mozilla/5.0'.  The response is scanned line by line; the first match
      of RE is stripped of its "," separators and returned as an int.
      Returns 0 when no line of the response matches.

      Errors raised while opening or reading the URL are not caught here and
      propagate to the caller.
      """
      params = urllib.urlencode({"q": "\"%s\"" % keyword})
      request = urllib2.Request(SEARCH_URL % params)
      request.add_header('User-agent', 'Mozilla/5.0')
      request.add_header('Host', 'www.google.com')

      response = urllib2.urlopen(request)
      try:
          for line in response:
              # NOTE(review): every response line is echoed to stdout; this
              # looks like debugging residue -- confirm before removing.
              print(line)
              matches = RE.findall(line)
              if matches:
                  return int(matches[0].replace(",", ""))
          return 0
      finally:
          # Release the connection on every exit path, including the early
          # return from inside the loop and any read error.
          response.close()
  def process_phrase_file(name):
      """Look up the hit count of every phrase in file *name*.

      Each line of *name* is stripped and decoded as UTF-8 to form one
      phrase.  Every distinct phrase is passed (UTF-8 encoded) to
      get_search_result; a failed lookup is reported on stderr and retried on
      the next pass, so the function only returns once every phrase has a
      count.  A progress percentage is printed to stdout after each successful
      lookup.  The results are written to ``name + ".out"`` as UTF-8 lines of
      the form "<phrase>\\t<count>", one per input line and in input order.

      Errors opening or writing either file propagate to the caller.
      """
      source = open(name)
      try:
          phrases = [unicode(raw.strip(), "utf8") for raw in source]
      finally:
          source.close()

      # Count distinct phrases: repeated lines (e.g. several blank lines) share
      # one dictionary entry, so comparing against len(phrases) would never
      # terminate.
      unique_total = len(set(phrases))
      phrases_dict = {}
      while len(phrases_dict) != unique_total:
          for p in phrases:
              if p in phrases_dict:
                  continue
              try:
                  phrases_dict[p] = get_search_result(p.encode("utf8"))
              except Exception:
                  # Exception rather than a bare except, so Ctrl-C and
                  # SystemExit can still break out of the retry loop.
                  traceback.print_exc()
                  line = u"Search %s failed" % p
                  # Encode explicitly: writing unicode to a stderr without a
                  # suitable encoding would raise inside this handler.
                  sys.stderr.write(line.encode("utf8") + "\n")
              else:
                  print("%d%%" % (len(phrases_dict) * 100 // unique_total))

      output = open(name + ".out", "w")
      try:
          for p in phrases:
              line = u"%s\t%d" % (p, phrases_dict[p])
              output.write(line.encode("utf8") + "\n")
      finally:
          output.close()
  if __name__ == "__main__":
      # Treat each command-line argument as a phrase and print its hit count.
      for phrase in sys.argv[1:]:
          hits = get_search_result(phrase)
          print(hits)
      # Disabled batch mode: treat each argument as a phrase file instead.
      #for filename in sys.argv[1:]:
      #    process_phrase_file(filename)