/python/engine/PinYin/tools/phrase_filter.py
http://scim-python.googlecode.com/ · Python · 28 lines · 24 code · 4 blank · 0 comment · 7 complexity · d6668bb1bd225e986ba01842887fa295 MD5 · raw file
- import sys
- sys.path.append ("..")
- import bz2
def load_phrase(_file):
    """Collect the first whitespace-separated field of every line in *_file*.

    Args:
        _file: Iterable yielding UTF-8 encoded byte-string lines (for
            example an open ``bz2.BZ2File``).

    Returns:
        A list holding the first field of each non-blank line, in input
        order.  Duplicates are kept.

    Raises:
        UnicodeDecodeError: If a line is not valid UTF-8.
    """
    phrases = []
    for raw in _file:
        # bytes.decode behaves the same for Python 2 ``str`` and Python 3
        # ``bytes``; split() with no argument already ignores leading and
        # trailing whitespace, so no separate strip() is needed.
        fields = raw.decode("utf8").split()
        # Blank or whitespace-only lines carry no phrase; indexing [0] on
        # them would raise IndexError, so they are skipped.
        if fields:
            phrases.append(fields[0])
    return phrases
def main(filenames, in_file):
    """Filter standard input by membership of each line's first field.

    Every file in *filenames* is opened as a bz2 archive and handed to
    ``load_phrase``; the returned phrases are merged into one set.  Each
    line read from standard input is then decoded as UTF-8, stripped, and
    written to standard output (UTF-8 encoded, newline-terminated) when
    "first field is in the phrase set" equals *in_file*.

    Args:
        filenames: Iterable of paths to bz2-compressed phrase files.
        in_file: ``True`` to keep lines whose first field is in the phrase
            set, ``False`` to keep lines whose first field is not.

    Blank input lines have no first field and are skipped.
    """
    phrase_dict = set()
    for filename in filenames:
        phrase_file = bz2.BZ2File(filename, "r")
        try:
            phrase_dict.update(load_phrase(phrase_file))
        finally:
            # Release the handle even when loading fails part-way.
            phrase_file.close()

    # Work on bytes at the I/O boundary: Python 3 exposes the byte streams
    # as ``.buffer``; on Python 2 the standard streams already carry bytes.
    stdin = getattr(sys.stdin, "buffer", sys.stdin)
    stdout = getattr(sys.stdout, "buffer", sys.stdout)

    for raw in stdin:
        line = raw.decode("utf8").strip()
        fields = line.split()
        if not fields:
            # Nothing to look up; indexing [0] here would raise IndexError.
            continue
        if (fields[0] in phrase_dict) == in_file:
            stdout.write((line + "\n").encode("utf8"))
if __name__ == "__main__":
    # Command line: [-v] PHRASE_FILE.bz2 ...
    # Without "-v", lines whose first field is in the phrase files are kept;
    # a leading "-v" inverts the filter and keeps the other lines instead.
    args = sys.argv[1:]
    if not args:
        # Reading sys.argv[1] with no arguments would die with a bare
        # IndexError traceback; exit with a usage message instead.
        sys.exit("usage: %s [-v] PHRASE_FILE.bz2 ... < input" % sys.argv[0])
    if args[0] == "-v":
        main(args[1:], False)
    else:
        main(args, True)