/demo.py
# -*- coding: utf-8 -*-
import os
import sys
from datetime import datetime

import numpy as np
import pandas as pd

import mail

# Python 2 hack: reset the interpreter's default string encoding to UTF-8 so
# the Chinese mail bodies handled below do not raise UnicodeDecodeError.
reload(sys)
sys.setdefaultencoding("utf-8")

mail_globals = mail.mail_globals
global_dir = mail_globals.data_dir
lang_cn, lang_en = mail_globals.MAIL_LANG_CN, mail_globals.MAIL_LANG_EN
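
# `mail` is the project-local package driving everything below: it provides
# MailSets (a mail container that appears to be pandas-backed, judging by the
# `mail_sets.df` usage further down), the preprocessing pipeline, and the
# feature / word2vec / topic model builders hung off `mail_globals`.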

def prepare_data(date_zone, file_format='pkl'):
    """Tokenize raw daily mail dumps and save them as MailSets files.

    :param date_zone: inclusive day range within 2015-12, e.g. "7-20"
    :param file_format: one of ["csv", "json", "bz2", "pkl"]
    :return: None; writes one "<raw file>_token.<file_format>" file per day
    """
    path_prefix = global_dir + "/2015-12-%s"
    st, ed = map(int, date_zone.split('-'))
    mail_files = [path_prefix % str(i).zfill(2) for i in range(st, ed + 1)]
    for mail_file in mail_files:
        saved_file = '%s_token.%s' % (mail_file, file_format)
        # load the raw mails, tokenize them in parallel, then dump the result
        mail_datas = mail.mail_io_load_file(mail_file)
        mail_datas = mail.parallel_preprocess(mail_datas)
        mail_sets = mail.MailSets(mail_datas)
        del mail_datas
        print "save file: ", saved_file
        mail_sets.dump(saved_file, file_format=file_format)
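
# Example: prepare_data("7-20") tokenizes the raw dumps 2015-12-07 through
# 2015-12-20 under mail_globals.data_dir and writes 2015-12-07_token.pkl
# through 2015-12-20_token.pkl next to them (assuming the raw daily files
# exist).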

def prepare_basic_model(date_zone, file_format='pkl'):
    """Build per-language TF-IDF feature models and a word2vec model from token files."""
    path_prefix = global_dir + "/2015-12-%s_token." + file_format
    st, ed = map(int, date_zone.split('-'))
    mail_files = [path_prefix % str(i).zfill(2) for i in range(st, ed + 1)]
    mail_sets_frames = []
    for mail_file in mail_files:
        mail_globals.logger.debug("load mail file: %s" % mail_file)
        if not os.path.exists(mail_file):
            mail_globals.logger.warn("file not found: %s" % mail_file)
            continue
        # load the tokenized mails for this day
        mail_sets = mail.MailSets()
        mail_sets.load(mail_file, file_format=file_format)
        if mail_sets.length() == 0:
            continue
        mail_globals.logger.debug("ignore duplicate mails: %s" % mail_file)
        mail_sets = mail.ignore_duplicate_mails(mail_sets)
        # remove signatures, then run the v2 preprocessing pass
        mail_sets = mail.remove_mail_signature(mail_sets)
        mail_sets = mail.mail_parallel_preprocess_v2(mail_sets)
        mail_sets_frames.append(mail_sets.df)
    # concatenate all days into a single MailSets
    mail_sets = mail.MailSets(pd.concat(mail_sets_frames))
    del mail_sets_frames
    # build one mail TF-IDF feature model per language
    mail_sets_cn = mail.filter_mails_by_lang(mail_sets, lang=lang_cn)
    mail_globals.timeit_v2(lambda: mail.build_feature_model(mail_sets_cn, lang=lang_cn, drop_dup=False),
                           desc="build feature model, mails=%d, lang=%s" % (mail_sets_cn.length(), lang_cn))
    mail_sets_en = mail.filter_mails_by_lang(mail_sets, lang=lang_en)
    mail_globals.timeit_v2(lambda: mail.build_feature_model(mail_sets_en, lang=lang_en, drop_dup=False),
                           desc="build feature model, mails=%d, lang=%s" % (mail_sets_en.length(), lang_en))
    mail_globals.feature.dump()
    # build the word2vec model over both languages and persist it
    mail_globals.timeit_v2(lambda: mail.build_word2vec_model(mail_sets),
                           desc="build word2vec model, mails=%d" % mail_sets.length())
    mail_globals.word2vec.dump()
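
# prepare_basic_model consumes the *_token.* files written by prepare_data,
# so the two are meant to run in order, e.g.:
#   prepare_data("7-31", file_format='pkl')
#   prepare_basic_model("7-31", file_format='pkl')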

def topic_analysis(date_zone, top_k=20, file_format='pkl'):
    """Fit the topic model day by day and write one analysis report per day."""
    path_prefix = mail_globals.data_dir + "/2015-12-%s_token." + file_format
    st, ed = map(int, date_zone.split('-'))
    mail_files = [path_prefix % str(i).zfill(2) for i in range(st, ed + 1)]
    for mail_file in mail_files:
        result_file = '%s_topic_analysis.csv' % mail_file
        # load the tokenized mails
        mail_sets = mail.MailSets()
        mail_sets.load(mail_file, file_format=file_format)
        # print "partial fit word2vec"
        # mail.build_word2vec_model(mail_sets)
        # remove signatures, timing the pass
        flag_time_1 = datetime.now()
        mail_sets = mail.remove_mail_signature(mail_sets)
        flag_time_2 = datetime.now()
        mail_globals.logger.info(
            "remove mail signature performance: mails=%d, time=%d" % (
                mail_sets.length(), int((flag_time_2 - flag_time_1).total_seconds())))
        # partial-fit the topic model, timing the pass
        flag_time_1 = datetime.now()
        mail.build_topic_model(mail_sets)
        flag_time_2 = datetime.now()
        mail_globals.logger.info(
            "build topic model performance: mails=%d, time=%d" % (
                mail_sets.length(), int((flag_time_2 - flag_time_1).total_seconds())))
        # analyze the fitted model and write the report
        sub_topic_analysis(result_file, top_k=top_k)
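
# Example: topic_analysis("7-13", top_k=10) partial-fits the topic model on
# each day's token file in turn and writes one
# "2015-12-XX_token.pkl_topic_analysis.csv" report next to it.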

def sub_topic_analysis(output, top_k=10):
    """Dump topic similarity and the top-k terms per topic for each topic count."""
    from mail_topic import TopicMining, check_topic_duplicate
    print "max topic number", TopicMining.MAX_TOPIC_NUM
    print "min topic number", TopicMining.MIN_TOPIC_NUM
    result_file = output
    mail_globals.logger.debug("save output file: %s" % result_file)
    feature_names = mail_globals.feature.get_feature_names(field='body', lang=lang_cn)
    # topic similarity: flag near-duplicate topics and write them first
    topic_matrix = mail_globals.topic.topic_matrix(lang=lang_cn)
    sim_topic = check_topic_duplicate(topic_matrix, top_k=top_k, lang=lang_cn)
    pd.DataFrame(sim_topic).to_csv(result_file)
    # sweep the topic count from MAX_TOPIC_NUM down to MIN_TOPIC_NUM + 1
    # (xrange's stop bound is exclusive)
    for topic_num in xrange(TopicMining.MAX_TOPIC_NUM, TopicMining.MIN_TOPIC_NUM, -1):
        # mail.topic_hits appears to return, per topic, the top-k feature
        # column indices (hits), their zero-padded weights (hits_val), and a
        # per-topic weight vector (topic_weight) -- inferred from the usage below
        hits, hits_val, topic_weight = mail.topic_hits(topic_num=topic_num, top_k=top_k, lang=lang_cn)
        with open(result_file, 'a+') as writer:
            writer.write("\n\ntopic number: %d\n" % topic_num)
            for idx in xrange(hits.shape[0]):
                # drop trailing zero weights, then resolve indices to term names
                trim_hits = np.trim_zeros(hits_val[idx, :], 'b')
                hits_name = [feature_names[col_idx] for col_idx in hits[idx, :len(trim_hits)]]
                writer.write("topic %d, weight=%.7f, %s \n" % (idx, topic_weight[idx], ', '.join(hits_name)))
                writer.write("topic %d, weight=%.7f, %s \n" % (idx, topic_weight[idx],
                                                               ', '.join(map(lambda val: "%.6f" % val, trim_hits))))
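    # mail_globals.topic.metric_topic_num presumably scores the sweep above
    # and picks the best topic count; the metric itself lives in the mail
    # package, not here.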
    best_topic_num = mail_globals.topic.metric_topic_num()
    mail_globals.logger.debug("output: %s, best topic num: %d" % (output, best_topic_num))

def build_topic_model(date_zone, file_format='pkl'):
    """Fit and persist per-language topic models from the token files."""
    import warnings
    # escalate pandas chained-assignment warnings to hard errors so that
    # accidental writes to DataFrame copies fail fast instead of passing silently
    warnings.simplefilter("error", pd.core.common.SettingWithCopyWarning)
    path_prefix = mail_globals.data_dir + "/2015-12-%s_token." + file_format
    st, ed = map(int, date_zone.split('-'))
    mail_files = [path_prefix % str(i).zfill(2) for i in range(st, ed + 1)]
    for mail_file in mail_files:
        # load, deduplicate, strip signatures, and preprocess each day
        mail_sets = mail.MailSets()
        mail_sets.load(mail_file, file_format=file_format)
        mail_sets = mail.ignore_duplicate_mails(mail_sets)
        mail_sets = mail.remove_mail_signature(mail_sets)
        mail_sets = mail.mail_parallel_preprocess_v2(mail_sets)
        # fit one topic model per language
        mail_sets_cn = mail.filter_mails_by_lang(mail_sets, lang=lang_cn)
        mail.build_topic_model(mail_sets_cn, lang=lang_cn, drop_dup=False, spam_filter=True, remove_signature=False)
        mail_sets_en = mail.filter_mails_by_lang(mail_sets, lang=lang_en)
        mail.build_topic_model(mail_sets_en, lang=lang_en, drop_dup=False, spam_filter=True, remove_signature=False)
    mail_globals.topic.dump()
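
# Example: build_topic_model("7-7") fits the per-language topic models on the
# single day 2015-12-07 and persists them via mail_globals.topic.dump(),
# matching the call in __main__ below.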

def urllib_request():
    """Smoke-test the local keyword-extraction HTTP service."""
    import simplejson as json
    import urllib2
    url_path = "http://127.0.0.1:27027/keywords?k=10"
    # sample Chinese text to extract keywords from (a passage on strategic
    # planning, with a few athletes' names mixed in as noise)
    item_stream = {
        "text": u"战略是谋划全局、决定长远的策略。全面建成小康社会、加快推进社会主义现代化、实现中华民族伟大复兴,都需要从战略高度审时度势地提出了治国理政的新理念新思想新举措,并进行系统的战略谋划和战略布局。2012年9月1日习近平在中央党校2012年秋季学期开学典礼上指出:“我们要全面建成小康社会和实现社会主义现代化,有许许多多重大问题需要进行战略谋划。凡是涉及我国经济、政治、文化、社会、生态、外交、国防和党的建设的全局性的重大问题,都需要从战略上进行思考、研究和筹谋;凡是涉及改革发展稳定工作中的各种重大问题,姚明,易建联,科比,也都需要从战略上拿出治本之策"}
    item_stream = json.dumps(item_stream)
    request_obj = urllib2.Request(url=url_path, data=item_stream)
    request_obj.add_header('Content-Type', 'application/json; charset=utf-8')
    # request_obj.add_header('Content-Length', len(item_stream))
    result = json.loads(urllib2.urlopen(request_obj).read())
    for item in result['keywords']['words']:
        print item
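
# The keyword service is assumed to be running locally on port 27027 and to
# answer with JSON shaped like {"keywords": {"words": [...]}}; both the
# endpoint and the response layout are inferred from the request code above.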

if __name__ == "__main__":
    mail.init()
    build_topic_model('7-7', file_format='pkl')
    # prepare_basic_model("7-31", file_format='pkl')
    # prepare_data("21-31", file_format='pkl')
    # topic_analysis("7-13", top_k=10)