/python/engine/PinYin/annotate.py
http://scim-python.googlecode.com/ · Python · 46 lines · 21 code · 3 blank · 22 comment · 4 complexity · 16d0aaa780d2a5246d3e52aadf58c0b3 MD5 · raw file
- import ZhengJu
def count_single_chars(path, value_column=2):
    """Tally the single-character entries of a tab-separated UTF-8 file.

    Each line is decoded as UTF-8 and split on tabs.  When the first field
    is exactly one character long, the entry is counted and the integer in
    column ``value_column`` is added to a running total.

    NOTE(review): the indentation of the original script was lost; the sum
    is assumed to be nested under the single-character test together with
    the counter -- confirm against the upstream file.

    Args:
        path: Path of the tab-separated input file.
        value_column: Zero-based index of the integer column to sum
            (presumably a frequency -- verify against the data file).

    Returns:
        A ``(total, count)`` tuple: the summed column values and the number
        of single-character entries.

    Raises:
        IndexError: A single-character entry has no ``value_column`` field.
        ValueError: That field is not an integer.
    """
    total = 0
    count = 0
    # Binary mode plus an explicit decode behaves the same on Python 2 and 3,
    # and the ``with`` block closes the handle the original left open.
    with open(path, "rb") as handle:
        for raw in handle:
            text = raw.decode("utf8")
            if not text.strip():
                # A blank line used to look like a one-character entry (its
                # newline) and then crash on the missing value column.
                continue
            fields = text.split(u"\t")
            if len(fields[0]) == 1:
                count += 1
                total += int(fields[value_column])
    return total, count


if __name__ == "__main__":
    # Only the disabled annotation step below uses the editor; it is still
    # constructed so running the script does what it always did.
    editor = ZhengJu.Editor()
    i, count = count_single_chars("google_annotated_uniq.txt")
    # Disabled per-line annotation step, kept from the original loop body:
    #~ try:
    #~     editor.reverse(p[0])
    #~     pinyin = "'".join (map (str, editor.wordlist))
    #~     print (p[0] + "\t" + pinyin + "\t" + p[1]).encode ("utf-8"),
    #~ except:
    #~     pass
    # Same "i count" output as the Python 2 statement ``print i,count``.
    print("%d %d" % (i, count))
- #~ for l in file ("google.txt"):
- #~ ll = unicode(l,"utf8")
- #~ p = (ll).split(u"\t")
- #~ print p[0]
- #~ if len(p[0])==1:
- #~ count += 1
- #~ i += int(p[1])
- #~ print p[0].encode ("utf-8")
- #~ try:
- #~ editor.reverse(p[0])
- #~ pinyin = "'".join (map (str, editor.wordlist))
- #~ print (p[0] + "\t" + pinyin + "\t" + p[1]).encode ("utf-8"),
- #~ except:
- #~ pass
- #~ print i,count
- import sets as Set
def unique_first_field_lines(path):
    """Yield each line of ``path`` whose first tab-separated field is new.

    Lines are read as bytes and decoded as UTF-8 only to extract the key
    (the text before the first tab).  The first line seen for each key is
    yielded unchanged, in file order; later lines with the same key are
    dropped.

    Args:
        path: Path of the tab-separated UTF-8 input file.

    Yields:
        The raw bytes of every retained line, including its line ending.

    Raises:
        UnicodeDecodeError: A line is not valid UTF-8.
    """
    # The builtin ``set`` replaces the deprecated ``sets.Set``.
    seen = set()
    # ``with`` closes the handle the original left open.
    with open(path, "rb") as handle:
        for raw in handle:
            key = raw.decode("utf8").split(u"\t")[0]
            if key not in seen:
                seen.add(key)
                yield raw


if __name__ == "__main__":
    import sys

    # Python 3 exposes the byte stream as ``sys.stdout.buffer``; on Python 2
    # ``sys.stdout`` accepts byte strings directly.
    _out = getattr(sys.stdout, "buffer", sys.stdout)
    for _line in unique_first_field_lines("google_annotated.txt"):
        # Lines keep their own newline, so they are written verbatim, as the
        # original ``print ll.encode("utf-8"),`` did.
        _out.write(_line)