/python/engine/PinYin/annotate.py

http://scim-python.googlecode.com/ · Python · 46 lines · 21 code · 3 blank · 22 comment · 4 complexity · 16d0aaa780d2a5246d3e52aadf58c0b3 MD5 · raw file

  1. import ZhengJu
  # The editor is referenced only by the disabled reverse-lookup step that is
  # kept (commented out) inside the loop below.
  editor = ZhengJu.Editor()

  # Tally the single-character entries of google_annotated_uniq.txt:
  # `count` is how many such entries there are and `total` is the sum of the
  # integer found in their third tab-separated column.
  # presumably the columns are entry / pinyin / frequency, judging by the
  # disabled output line below -- verify against the data file.
  total = 0
  count = 0

  # open() inside a with-block replaces the bare file() call so the handle is
  # closed as soon as the loop finishes (or fails).
  with open("google_annotated_uniq.txt") as annotated:
      for raw_line in annotated:
          fields = unicode(raw_line, "utf8").split(u"\t")
          #~ print fields[0]
          # NOTE(review): the original indentation was lost; both updates are
          # assumed to belong to the single-character branch -- confirm.
          if len(fields[0]) == 1:
              count += 1
              total += int(fields[2])
          #~ try:
          #~     editor.reverse(fields[0])
          #~     pinyin = "'".join(map(str, editor.wordlist))
          #~     print (fields[0] + "\t" + pinyin + "\t" + fields[1]).encode("utf-8"),
          #~ except:
          #~     pass

  # Single-argument form: prints "<total> <count>" exactly like the original
  # `print i,count`, and is also valid syntax on Python 3.
  print("%d %d" % (total, count))
  19. #~ for l in file ("google.txt"):
  20. #~ ll = unicode(l,"utf8")
  21. #~ p = (ll).split(u"\t")
  22. #~ print p[0]
  23. #~ if len(p[0])==1:
  24. #~ count += 1
  25. #~ i += int(p[1])
  26. #~ print p[0].encode ("utf-8")
  27. #~ try:
  28. #~ editor.reverse(p[0])
  29. #~ pinyin = "'".join (map (str, editor.wordlist))
  30. #~ print (p[0] + "\t" + pinyin + "\t" + p[1]).encode ("utf-8"),
  31. #~ except:
  32. #~ pass
  33. #~ print i,count
  34. import sets as Set
  # Echo google_annotated.txt to stdout, keeping only the first line seen for
  # each distinct value of the first tab-separated column.
  # NOTE(review): the original indentation was lost; the output statement is
  # assumed to sit inside the "not seen yet" branch -- confirm.
  import sys

  # The built-in set replaces the deprecated sets.Set; the unused `i` and
  # `count` re-initialisations from the original were dropped.
  seen = set()

  # open() inside a with-block so the handle is closed deterministically
  # instead of being left to the garbage collector.
  with open("google_annotated.txt") as annotated:
      for raw_line in annotated:
          text = unicode(raw_line, "utf8")
          # Only column 0 is needed, so stop splitting after the first tab.
          key = text.split(u"\t", 1)[0]
          if key in seen:
              continue
          seen.add(key)
          # Lines still carry their own newline, so write them out as-is
          # rather than letting print add separators.
          sys.stdout.write(text.encode("utf-8"))