PageRenderTime 43ms CodeModel.GetById 7ms RepoModel.GetById 0ms app.codeStats 0ms

/mail_global.py

https://gitlab.com/yaojian/RenjuAI
Python | 295 lines | 268 code | 8 blank | 19 comment | 0 complexity | 2a8f5d288bc742fa3d2ff96e494b9989 MD5 | raw file
  1. #!/usr/bin/python
  2. # -*- coding: utf-8 -*-
  3. # author: yaojian-xy <yaojian-xy@360.cn>
  4. import re
  5. import os
  6. import time
  7. import sys
  8. import signal
  9. import functools
  10. from contextlib import contextmanager
  11. class MailGlobalObject:
  12. def __init__(self, etc_file=None):
  13. # register macro variable
  14. self.__register_macro_define()
  15. self.__register_regex()
  16. self.__register_etc(etc_file)
  17. # logger
  18. self.logger = Logger(self.etc_map['log_dir'], debug=self.DEBUG_MODEL)
  19. # register instance to factory
  20. # self.instance = {}
  21. # def __getattr__(self, key):
  22. # if key in self.instance:
  23. # return self.instance[key]
  24. # else:
  25. # raise AttributeError
  26. def __register_macro_define(self):
  27. self.DEBUG_MODEL = False
  28. self.RUN_MODEL = "online" # "online" or "offline"
  29. # mail language
  30. self.MAIL_LANG_CN = 'cn'
  31. self.MAIL_LANG_EN = 'en'
  32. # mail protocal
  33. self.MAIL_PROTO_SMTP = 'smtp'
  34. self.MAIL_PROTO_POP = 'pop'
  35. self.MAIL_PROTO_IMAP = 'imap'
  36. self.MAIL_PROTO_WEBMAIL = 'webmail'
  37. # macro for mail address node
  38. self.IN_DEGREE_AS_TO = 2
  39. self.IN_DEGREE_AS_CC = 3
  40. self.OUT_DEGREE = 4
  41. # macro for mail network edge
  42. self.NET_FROM_2_TO = 'from_to'
  43. self.NET_FROM_2_CC = 'from_cc'
  44. self.NET_ATTACH = 'attach'
  45. # macro for token
  46. self.MAX_ENG_WORD_LENGTH = 20
  47. self.INVALID_TIME = -1
  48. self.INNER_FEATURES = [u'工作', u'生活', u'情感', u'学习']
  49. def __register_regex(self):
  50. self.email_reg = re.compile('[^@|\s]+@[^@]+\.[^@|\s]+')
  51. self.email_reg_2 = re.compile(u"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-_]+(?:\.[a-zA-Z0-9-.]+)+")
  52. self.email_addr_ignore = re.compile(u'undisclosed-recipients|Mail Delivery Subsystem|Mail Delivery System',
  53. flags=re.IGNORECASE)
  54. self.email_html_br_tag = re.compile(u"<br.*?>|<p.*?>", flags=re.IGNORECASE | re.DOTALL)
  55. self.email_html_tag = re.compile(u'<style.*?style>|<script.*?script>|<.*?>|[\r\t]{2,}|&nbsp;', flags=re.IGNORECASE | re.DOTALL)
  56. self.url_reg = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
  57. self.unicode_reg = re.compile(
  58. u'(?:[\u0000-\u0009|\u000b-\u001f|\ufff1-\uffff|\u2007]|<U\+200E>|(?:&#\d{5};)|\\xa0)')
  59. # time regex
  60. """
  61. ex: ["Mon, 07 Dec 2015 22:35:35 +0800", "07 Dec 2015 22:35:35", "07 Dec 2015 22:35",
  62. "2015-12-11 14:10", "2015年12月12日 20:10", "2015年12月7日 15:42",
  63. "Wednesday, December 2, 2015 at 8:01 PM"
  64. ]
  65. """
  66. self.time_standard = re.compile(u'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$')
  67. self.time_reg_map = {u'凌晨': 'AM', u'上午': 'AM', u'中午': 'PM', u'下午': 'PM', u'晚上': 'PM',
  68. u'十一月': 11, u'十二月': 12,
  69. u"一月": 1, u'二月': 2, u'三月': 3, u'四月': 4, u'五月': 5,
  70. u'六月': 6, u'七月': 7, u'八月': 8, u'九月': 9, u'十月': 10,
  71. u'年': '-', u'月': '-', u'日': ' ',
  72. }
  73. self.time_reg_sub = re.compile("|".join([re.escape(k) for k in self.time_reg_map.keys()]), re.M)
  74. self.time_reg_sub_1 = re.compile(u"(?:星期|周)[一|二|三|四|五|六|七|日|1-7]|\([^\)]*\)|[\*]|<.*?>")
  75. self.time_reg_sub_2 = re.compile(u'(.*)(PM|AM)\s*(\d{1,2}\:\d{1,2})') # reverse "PM 15:06" to "15:06 PM"
  76. self.time_reg_sub_3 = re.compile(u'(\d{4}-\d{2}-\d{2})(\d{2}\:\d{2})') # 2015-12-1014:52
  77. # "09 Dec 2015 14:54:30 +0300", ""
  78. self.time_reg_1 = re.compile(
  79. u'[\s\*]*(?:\d{1,2}\s[a-zA-Z]{3}\s\d{4}\s+\d{1,2}\:\s?\d{1,2}(?:\:\s?\d{1,2})?\s?(?:[+-0]{1}\d{4})?)')
  80. # self.time_reg_2 = re.compile(u'(?:(\d{4})[年-](\d{1,2})[月-](\d{1,2})日?\s+(\d{1,2}:\d{1,2}(?:\:\d{1,2})?))')
  81. # base64 set
  82. self.base64_set = set('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=\r\n')
  83. def register_instance(self, name, instance):
  84. # self.instance[name] = instance
  85. if name == "dict":
  86. self.dict = instance
  87. elif name == "reply_parser":
  88. self.reply_parser = instance
  89. elif name == "tokenizer":
  90. self.tokenizer = instance
  91. elif name == "keywords_extract":
  92. self.keywords_extract = instance
  93. elif name == "feature":
  94. self.feature = instance
  95. elif name == "word2vec":
  96. self.word2vec = instance
  97. elif name == "wordcount":
  98. self.word_count = instance
  99. elif name == "wordcloud":
  100. self.word_cloud = instance
  101. elif name == "cluster":
  102. self.cluster = instance
  103. elif name == "topic":
  104. self.topic = instance
  105. elif name == "spam_clf":
  106. self.spam_clf = instance
  107. elif name == "graphnet":
  108. self.graphnet = instance
  109. elif name == "portrait":
  110. self.portrait = instance
  111. elif name == "update":
  112. self.update = instance
  113. elif name == "similar":
  114. self.similar = instance
  115. def __register_etc(self, etc_file=None):
  116. if etc_file is None:
  117. self.data_dir = './data'
  118. self.etc_map = {'log_dir': "%s/log" % self.data_dir,
  119. 'user_dict': "%s/user_dict.txt" % self.data_dir,
  120. 'stop_words': '%s/stop_words.txt' % self.data_dir,
  121. 'font_file': "%s/songti.TTF" % self.data_dir,
  122. 'topic_model': "%s/topic.pkl" % self.data_dir,
  123. 'cluster_model': "%s/cluster.pkl" % self.data_dir,
  124. 'feature_model': "%s/feature.bin" % self.data_dir,
  125. 'word2vec_model': "%s/word2vec_model" % self.data_dir,
  126. 'wordcnt_file': "%s/word_counter.bz2" % self.data_dir,
  127. 'net_model': "%s/net.model" % self.data_dir,
  128. 'portrait_file': "%s/portrait.pkl" % self.data_dir,
  129. 'spam_model': "%s/spam_classifier.pkl" % self.data_dir,
  130. 'spam_rule_file': "%s/Chinese_rules.json" % self.data_dir,
  131. 'pull_time_flag': "%s/pull_time.flag" % self.data_dir,
  132. 'tokenizer': 'jieba', # ['pynlpir', 'jieba']
  133. # 'cn_ratio': 0.001, # used to recognize chinese mail
  134. 'interval': 'week', # [day, week, month]
  135. 'net_lib': 'graph-tool', # ["graph-tool", "networkx"]
  136. 'n_jobs': 10,
  137. 'mail_offline_server': 'http://localhost:27028',
  138. 'mail_online_server': 'http://localhost:27027',
  139. 'mail_bak_dir': './mails_bak',
  140. 'mail_db_file': "%s/mail_db.bin" % self.data_dir,
  141. 'mail_db_profile': "%s/mail_profile.bin" % self.data_dir,
  142. }
  143. else: # read etc file
  144. pass
  145. def set_time_flag(self, time_stamp):
  146. open(self.etc_map['pull_time_flag'], 'wb').write(str(time_stamp))
  147. def get_time_flag(self):
  148. return open(self.etc_map['pull_time_flag'], 'rb').read()
  149. def get_date_flag(self):
  150. time_flag = int(self.get_time_flag())
  151. return time.strftime('%Y-%m-%d', time.localtime(time_flag))
  152. @staticmethod
  153. def timeit(call_func):
  154. """
  155. usage: add "@timeit" before function declaration
  156. :param call_func:
  157. :return:
  158. """
  159. @functools.wraps(call_func)
  160. def call_back(*args, **kwargs):
  161. start_time = time.time()
  162. call_res = call_func(*args, **kwargs)
  163. elapsed_time = time.time() - start_time
  164. if 'desc' not in kwargs:
  165. kwargs['desc'] = None
  166. MailGlobal.logger.debug(
  167. 'function [{}], desc=[{}] finished in {} second'.format(call_func.__name__, kwargs['desc'],
  168. int(elapsed_time)))
  169. return call_res
  170. return call_back
  171. @staticmethod
  172. def timeit_v2(call_func, desc=None):
  173. """
  174. usage: timeit_v2(lambda: func(params))
  175. :param call_func:
  176. :return:
  177. """
  178. start_time = time.time()
  179. call_res = call_func()
  180. elapsed_time = time.time() - start_time
  181. MailGlobal.logger.info('execute=[{}] finished in {} ms'.format(desc, int(elapsed_time * 1000)))
  182. return call_res
  183. @staticmethod
  184. def timeit_v3(call_func, desc=None):
  185. return call_func
  186. @staticmethod
  187. def max_scale(array):
  188. max_arr = float(max(array))
  189. return [x/max_arr for x in array]
  190. class Logger:
  191. def __init__(self, log_dir, debug=False):
  192. self._log_dir = log_dir
  193. if not os.path.exists(self._log_dir):
  194. os.mkdir(self._log_dir)
  195. self._debug = debug
  196. self.DATE_FORMAT = "%Y-%m-%d"
  197. self.DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"
  198. self._log_date = self._curdate()
  199. self._logfile = "%s/%s.log" % (self._log_dir, self._log_date)
  200. self._logger = open(self._logfile, 'a+')
  201. self.queue = []
  202. self.switch = True
  203. def _curdate(self):
  204. return time.strftime(self.DATE_FORMAT, time.localtime())
  205. def _curdatetime(self):
  206. return time.strftime(self.DATETIME_FORMAT, time.localtime())
  207. def _switch_log(self):
  208. if self._log_date != self._curdate() and self.switch: # create new logfile
  209. # close old logfile
  210. self._logger.close()
  211. # make new log file
  212. self._log_date = self._curdate()
  213. self._logfile = "%s/%s.log" % (self._log_dir, self._log_date)
  214. self._logger = open(self._logfile, 'a+')
  215. def _writer(self, msg):
  216. self._switch_log()
  217. # maybe locker is needed here
  218. self._logger.write("%s\n" % msg)
  219. def debug(self, msg):
  220. if self._debug:
  221. msg = "%s [DEBUG] %s" % (self._curdatetime(), msg)
  222. self._writer(msg)
  223. def info(self, msg):
  224. msg = "%s [INFO] %s" % (self._curdatetime(), msg)
  225. print msg
  226. self._writer(msg)
  227. def warn(self, msg, log_queue=None):
  228. msg = "%s [WARN] %s" % (self._curdatetime(), msg)
  229. if log_queue is None:
  230. self._writer(msg)
  231. else:
  232. log_queue.append(msg)
  233. def error(self, msg, to_exit=False):
  234. msg = "%s [ERROR] %s" % (self._curdatetime(), msg)
  235. print msg
  236. self._writer(msg)
  237. if to_exit:
  238. sys.exit(-1)
  239. class TimeoutException(Exception): pass
  240. @contextmanager
  241. def time_limit(seconds):
  242. def signal_handler(signum, frame):
  243. raise TimeoutException
  244. signal.signal(signal.SIGALRM, signal_handler)
  245. signal.alarm(seconds)
  246. try:
  247. yield
  248. finally:
  249. signal.alarm(0)
# Module-level shared instance, created at import time with the default
# configuration; MailGlobalObject.timeit and timeit_v2 log through
# MailGlobal.logger.
MailGlobal = MailGlobalObject()