
/demo.py

https://gitlab.com/yaojian/RenjuAI
# -*- coding: utf-8 -*-
# NOTE: this is Python 2 code (print statements, reload()/setdefaultencoding,
# urllib2, xrange); it will not run unmodified under Python 3.
import os
import sys
from datetime import datetime
import numpy as np
import pandas as pd
import mail

reload(sys)
sys.setdefaultencoding("utf-8")

mail_globals = mail.mail_globals
global_dir = mail_globals.data_dir
lang_cn, lang_en = mail_globals.MAIL_LANG_CN, mail_globals.MAIL_LANG_EN
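
# prepare_data walks a range of daily mail dumps under data_dir (files named
# "2015-12-DD"), tokenizes them, and persists each one as a "*_token.<format>"
# file that the later pipeline steps load.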
def prepare_data(date_zone, file_format='pkl'):
    """
    :param date_zone: day range within 2015-12, e.g. "7-20"
    :param file_format: one of ["csv", "json", "bz2", "pkl"]
    :return:
    """
    path_prefix = global_dir + "/2015-12-%s"
    st, ed = map(int, date_zone.split('-'))
    mail_files = [path_prefix % str(i).zfill(2) for i in range(st, ed + 1)]
    for mail_file in mail_files:
        saved_file = '%s_token.%s' % (mail_file, file_format)
        # load mails
        mail_datas = mail.mail_io_load_file(mail_file)
        mail_datas = mail.parallel_preprocess(mail_datas)
        mail_sets = mail.MailSets(mail_datas)
        del mail_datas
        print "save file: ", saved_file
        mail_sets.dump(saved_file, file_format=file_format)
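
# prepare_basic_model reloads the tokenized dumps, drops duplicate mails and
# signatures, then fits a TF-IDF feature model per language plus a shared
# word2vec model, and dumps both through mail_globals.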
def prepare_basic_model(date_zone, file_format='pkl'):
    from mail_preprocess import parallel_preprocess_v2  # unused; mail.mail_parallel_preprocess_v2 is called below
    path_prefix = global_dir + "/2015-12-%s_token." + file_format
    st, ed = map(int, date_zone.split('-'))
    mail_files = [path_prefix % str(i).zfill(2) for i in range(st, ed + 1)]
    mail_sets_frames = []
    for mail_file in mail_files:
        mail_globals.logger.debug("load mail file: %s" % mail_file)
        if not os.path.exists(mail_file):
            mail_globals.logger.warn("not found file: %s" % mail_file)
            continue
        # load mails
        mail_sets = mail.MailSets()
        mail_sets.load(mail_file, file_format=file_format)
        if mail_sets.length() == 0:
            continue
        mail_globals.logger.debug("ignore duplicate mails=%s" % mail_file)
        mail_sets = mail.ignore_duplicate_mails(mail_sets)
        # remove signature
        mail_sets = mail.remove_mail_signature(mail_sets)
        mail_sets = mail.mail_parallel_preprocess_v2(mail_sets)
        mail_sets_frames.append(mail_sets.df)
        # del mail_sets
    mail_sets = mail.MailSets(pd.concat(mail_sets_frames))
    del mail_sets_frames
    # build mail tfidf feature model
    mail_sets_cn = mail.filter_mails_by_lang(mail_sets, lang=lang_cn)
    mail_globals.timeit_v2(lambda: mail.build_feature_model(mail_sets_cn, lang=lang_cn, drop_dup=False),
                           desc="build feature model, mails=%d, lang=%s" % (mail_sets_cn.length(), lang_cn))
    # del mail_sets_cn
    mail_sets_en = mail.filter_mails_by_lang(mail_sets, lang=lang_en)
    mail_globals.timeit_v2(lambda: mail.build_feature_model(mail_sets_en, lang=lang_en, drop_dup=False),
                           desc="build feature model, mails=%d, lang=%s" % (mail_sets_en.length(), lang_en))
    # del mail_sets_en
    mail_globals.feature.dump()
    # build word2vec model
    mail_globals.timeit_v2(lambda: mail.build_word2vec_model(mail_sets),
                           desc="build word2vec model, mails=%d" % mail_sets.length())
    mail_globals.word2vec.dump()
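
# topic_analysis partial-fits the topic model one daily file at a time,
# logging wall-clock timings for signature removal and model fitting, then
# writes a per-file "*_topic_analysis.csv" report via sub_topic_analysis.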
def topic_analysis(date_zone, top_k=20, file_format='pkl'):
    path_prefix = mail_globals.data_dir + "/2015-12-%s_token." + file_format
    st, ed = map(int, date_zone.split('-'))
    mail_files = [path_prefix % str(i).zfill(2) for i in range(st, ed + 1)]
    for mail_file in mail_files:
        result_file = '%s_topic_analysis.csv' % mail_file
        # load mails
        mail_sets = mail.MailSets()
        mail_sets.load(mail_file, file_format=file_format)
        # print "partial fit word2vec"
        # mail.build_word2vec_model(mail_sets)
        # remove signature
        flag_time_1 = datetime.now()
        mail_sets = mail.remove_mail_signature(mail_sets)
        flag_time_2 = datetime.now()
        mail_globals.logger.info(
            "remove mail signature performance: mails=%d, time=%d" % (
                mail_sets.length(), (flag_time_2 - flag_time_1).seconds))
        # partial fit topic
        flag_time_1 = datetime.now()
        mail.build_topic_model(mail_sets)
        flag_time_2 = datetime.now()
        mail_globals.logger.info(
            "build topic model performance: mails=%d, time=%d" % (
                mail_sets.length(), (flag_time_2 - flag_time_1).seconds))
        # analysis
        sub_topic_analysis(result_file, top_k=top_k)
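
# sub_topic_analysis first reports topic-to-topic similarity, then for every
# candidate topic count from MAX_TOPIC_NUM down to (but excluding)
# MIN_TOPIC_NUM appends each topic's top-k terms and their weights to the
# output file. Only Chinese (lang_cn) features are inspected here.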
def sub_topic_analysis(output, top_k=10):
    from mail_topic import TopicMining, check_topic_duplicate
    print "max topic number", TopicMining.MAX_TOPIC_NUM
    print "min topic number", TopicMining.MIN_TOPIC_NUM
    result_file = output
    mail_globals.logger.debug("save output file: %s" % result_file)
    feature_names = mail_globals.feature.get_feature_names(field='body', lang=lang_cn)
    # topic similarity
    topic_matrix = mail_globals.topic.topic_matrix(lang=lang_cn)
    sim_topic = check_topic_duplicate(topic_matrix, top_k=top_k, lang=lang_cn)
    pd.DataFrame(sim_topic).to_csv(result_file)
    # topic number
    for topic_num in xrange(TopicMining.MAX_TOPIC_NUM, TopicMining.MIN_TOPIC_NUM, -1):
        hits, hits_val, topic_weight = mail.topic_hits(topic_num=topic_num, top_k=top_k, lang=lang_cn)
        with open(result_file, 'a+') as writer:
            writer.write("\n\ntopic number: %d\n" % topic_num)
            for idx in xrange(hits.shape[0]):
                trim_hits = np.trim_zeros(hits_val[idx, :], 'b')
                hits_name = [feature_names[col_indics] for col_indics in hits[idx, :len(trim_hits)]]
                writer.write("topic %d, weight=%.7f, %s \n" % (idx, topic_weight[idx], ', '.join(hits_name)))
                writer.write("topic %d, weight=%.7f, %s \n" % (idx, topic_weight[idx],
                                                               ', '.join(map(lambda val: "%.6f" % val, trim_hits))))
    best_topic_num = mail_globals.topic.metric_topic_num()
    mail_globals.logger.debug("output:%s, best topic num: %d" % (output, best_topic_num))
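
# This local build_topic_model (distinct from mail.build_topic_model, which
# it calls per language) rebuilds the topic models over the tokenized dumps
# and dumps them. SettingWithCopyWarning is escalated to an error so chained
# pandas assignments in the pipeline fail fast rather than silently.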
def build_topic_model(date_zone, file_format='pkl'):
    import warnings
    warnings.simplefilter("error", pd.core.common.SettingWithCopyWarning)
    path_prefix = mail_globals.data_dir + "/2015-12-%s_token." + file_format
    st, ed = map(int, date_zone.split('-'))
    mail_files = [path_prefix % str(i).zfill(2) for i in range(st, ed + 1)]
    for mail_file in mail_files:
        # load mails
        mail_sets = mail.MailSets()
        mail_sets.load(mail_file, file_format=file_format)
        mail_sets = mail.ignore_duplicate_mails(mail_sets)
        mail_sets = mail.remove_mail_signature(mail_sets)
        mail_sets = mail.mail_parallel_preprocess_v2(mail_sets)
        mail_sets_cn = mail.filter_mails_by_lang(mail_sets, lang=lang_cn)
        mail.build_topic_model(mail_sets_cn, lang=lang_cn, drop_dup=False, spam_filter=True, remove_signature=False)
        mail_sets_en = mail.filter_mails_by_lang(mail_sets, lang=lang_en)
        mail.build_topic_model(mail_sets_en, lang=lang_en, drop_dup=False, spam_filter=True, remove_signature=False)
    mail_globals.topic.dump()
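
# urllib_request smoke-tests the keyword-extraction HTTP service: it POSTs a
# JSON body containing a Chinese sample passage to /keywords?k=10 on
# localhost and prints the returned keywords. The response is expected to be
# shaped like {"keywords": {"words": [...]}}, matching the parsing below.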
def urllib_request():
    import simplejson as json
    import urllib2
    url_path = "http://127.0.0.1:27027/keywords?k=10"
    # sample payload: a Chinese passage on strategic planning, with a few
    # athlete names (Yao Ming, Yi Jianlian, Kobe) mixed in, presumably as
    # noise to exercise the keyword extractor
    item_stream = {
        "text": u"战略是谋划全局、决定长远的策略。全面建成小康社会、加快推进社会主义现代化、实现中华民族伟大复兴,都需要从战略高度审时度势地提出了治国理政的新理念新思想新举措,并进行系统的战略谋划和战略布局。2012年9月1日习近平在中央党校2012年秋季学期开学典礼上指出:“我们要全面建成小康社会和实现社会主义现代化,有许许多多重大问题需要进行战略谋划。凡是涉及我国经济、政治、文化、社会、生态、外交、国防和党的建设的全局性的重大问题,都需要从战略上进行思考、研究和筹谋;凡是涉及改革发展稳定工作中的各种重大问题,姚明,易建联,科比,也都需要从战略上拿出治本之策"}
    item_stream = json.dumps(item_stream)
    request_obj = urllib2.Request(url=url_path, data=item_stream)
    request_obj.add_header('Content-Type', 'application/json; charset=utf-8')
    # request_obj.add_header('Content-Length', len(item_stream))
    result = json.loads(urllib2.urlopen(request_obj).read())
    for item in result['keywords']['words']:
        print item
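
# Entry point: mail.init() presumably initializes mail_globals (logger and
# the feature/topic/word2vec stores) before any pipeline step runs; the
# other steps are left commented out as alternative entry points.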
if __name__ == "__main__":
    mail.init()
    build_topic_model('7-7', file_format='pkl')
    # prepare_basic_model("7-31", file_format='pkl')
    # prepare_data("21-31", file_format='pkl')
    # topic_analysis("7-13", top_k=10)