/mail_global.py
Python | 295 lines | 268 code | 8 blank | 19 comment | 0 complexity | 2a8f5d288bc742fa3d2ff96e494b9989 MD5 | raw file
- #!/usr/bin/python
- # -*- coding: utf-8 -*-
- # author: yaojian-xy <yaojian-xy@360.cn>
- import re
- import os
- import time
- import sys
- import signal
- import functools
- from contextlib import contextmanager
class MailGlobalObject:
    """Shared holder for mail-analysis constants, regexes, settings and helpers.

    On construction it registers the macro constants, compiles the regular
    expressions, loads the settings map (``etc_map``) and opens the logger.
    Other components are attached later through ``register_instance``.
    """

    # Public registration name -> attribute the instance is stored under.
    # Note that "wordcount"/"wordcloud" map to underscore-separated attributes.
    _INSTANCE_ATTRS = {
        "dict": "dict",
        "reply_parser": "reply_parser",
        "tokenizer": "tokenizer",
        "keywords_extract": "keywords_extract",
        "feature": "feature",
        "word2vec": "word2vec",
        "wordcount": "word_count",
        "wordcloud": "word_cloud",
        "cluster": "cluster",
        "topic": "topic",
        "spam_clf": "spam_clf",
        "graphnet": "graphnet",
        "portrait": "portrait",
        "update": "update",
        "similar": "similar",
    }

    def __init__(self, etc_file=None):
        """Build the global object.

        :param etc_file: optional settings file; parsing is not implemented,
            so the built-in defaults are used either way.
        """
        # register macro variable
        self.__register_macro_define()
        self.__register_regex()
        self.__register_etc(etc_file)
        # logger
        self.logger = Logger(self.etc_map['log_dir'], debug=self.DEBUG_MODEL)

    def __register_macro_define(self):
        """Define the constant-like attributes used across the project."""
        self.DEBUG_MODEL = False
        self.RUN_MODEL = "online"  # "online" or "offline"
        # mail language
        self.MAIL_LANG_CN = 'cn'
        self.MAIL_LANG_EN = 'en'
        # mail protocol
        self.MAIL_PROTO_SMTP = 'smtp'
        self.MAIL_PROTO_POP = 'pop'
        self.MAIL_PROTO_IMAP = 'imap'
        self.MAIL_PROTO_WEBMAIL = 'webmail'
        # macro for mail address node
        self.IN_DEGREE_AS_TO = 2
        self.IN_DEGREE_AS_CC = 3
        self.OUT_DEGREE = 4
        # macro for mail network edge
        self.NET_FROM_2_TO = 'from_to'
        self.NET_FROM_2_CC = 'from_cc'
        self.NET_ATTACH = 'attach'
        # macro for token
        self.MAX_ENG_WORD_LENGTH = 20
        self.INVALID_TIME = -1
        self.INNER_FEATURES = [u'工作', u'生活', u'情感', u'学习']

    def __register_regex(self):
        """Compile the regular expressions used for mail parsing.

        The pattern literals are kept exactly as they were; they are not raw
        strings because several rely on ``\\u``/``\\r``/``\\t`` escapes being
        processed by the Python string literal.
        """
        self.email_reg = re.compile('[^@|\s]+@[^@]+\.[^@|\s]+')
        self.email_reg_2 = re.compile(u"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-_]+(?:\.[a-zA-Z0-9-.]+)+")
        self.email_addr_ignore = re.compile(
            u'undisclosed-recipients|Mail Delivery Subsystem|Mail Delivery System',
            flags=re.IGNORECASE)
        self.email_html_br_tag = re.compile(u"<br.*?>|<p.*?>", flags=re.IGNORECASE | re.DOTALL)
        # NOTE(review): the last alternative is a lone whitespace character in
        # the source as received; it may originally have been an HTML entity
        # such as "&nbsp;" -- confirm against upstream.
        self.email_html_tag = re.compile(u'<style.*?style>|<script.*?script>|<.*?>|[\r\t]{2,}| ', flags=re.IGNORECASE | re.DOTALL)
        self.url_reg = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        self.unicode_reg = re.compile(
            u'(?:[\u0000-\u0009|\u000b-\u001f|\ufff1-\uffff|\u2007]|<U\+200E>|(?:&#\d{5};)|\\xa0)')
        # time regex
        # ex: ["Mon, 07 Dec 2015 22:35:35 +0800", "07 Dec 2015 22:35:35", "07 Dec 2015 22:35",
        #      "2015-12-11 14:10", "2015年12月12日 20:10", "2015年12月7日 15:42",
        #      "Wednesday, December 2, 2015 at 8:01 PM"]
        self.time_standard = re.compile(u'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$')
        # Chinese date/time words -> replacement used when normalising dates.
        self.time_reg_map = {u'凌晨': 'AM', u'上午': 'AM', u'中午': 'PM', u'下午': 'PM', u'晚上': 'PM',
                             u'十一月': 11, u'十二月': 12,
                             u"一月": 1, u'二月': 2, u'三月': 3, u'四月': 4, u'五月': 5,
                             u'六月': 6, u'七月': 7, u'八月': 8, u'九月': 9, u'十月': 10,
                             u'年': '-', u'月': '-', u'日': ' ',
                             }
        # One alternation matching any key of time_reg_map.
        self.time_reg_sub = re.compile("|".join([re.escape(k) for k in self.time_reg_map.keys()]), re.M)
        self.time_reg_sub_1 = re.compile(u"(?:星期|周)[一|二|三|四|五|六|七|日|1-7]|\([^\)]*\)|[\*]|<.*?>")
        self.time_reg_sub_2 = re.compile(u'(.*)(PM|AM)\s*(\d{1,2}\:\d{1,2})')  # reverse "PM 15:06" to "15:06 PM"
        self.time_reg_sub_3 = re.compile(u'(\d{4}-\d{2}-\d{2})(\d{2}\:\d{2})')  # 2015-12-1014:52
        # "09 Dec 2015 14:54:30 +0300", ""
        self.time_reg_1 = re.compile(
            u'[\s\*]*(?:\d{1,2}\s[a-zA-Z]{3}\s\d{4}\s+\d{1,2}\:\s?\d{1,2}(?:\:\s?\d{1,2})?\s?(?:[+-0]{1}\d{4})?)')
        # base64 set
        self.base64_set = set('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=\r\n')

    def register_instance(self, name, instance):
        """Attach a component instance under the attribute mapped to ``name``.

        :param name: one of the keys of ``_INSTANCE_ATTRS``.
        :param instance: object to store.
        Unknown names are ignored, as before.
        """
        attr_name = self._INSTANCE_ATTRS.get(name)
        if attr_name is not None:
            setattr(self, attr_name, instance)

    def __register_etc(self, etc_file=None):
        """Populate ``self.data_dir`` and ``self.etc_map`` with the settings.

        :param etc_file: optional settings file. Reading it is not implemented;
            previously passing one left ``etc_map`` undefined and ``__init__``
            failed with ``AttributeError``. The defaults are now installed in
            both cases and a warning is emitted when a file was given.
        """
        if etc_file is not None:
            # Local import: the logger does not exist yet at this point.
            import warnings
            warnings.warn(
                "etc_file %r is ignored: reading a settings file is not "
                "implemented, built-in defaults are used" % (etc_file,),
                RuntimeWarning)
        self.data_dir = './data'
        self.etc_map = {'log_dir': "%s/log" % self.data_dir,
                        'user_dict': "%s/user_dict.txt" % self.data_dir,
                        'stop_words': '%s/stop_words.txt' % self.data_dir,
                        'font_file': "%s/songti.TTF" % self.data_dir,
                        'topic_model': "%s/topic.pkl" % self.data_dir,
                        'cluster_model': "%s/cluster.pkl" % self.data_dir,
                        'feature_model': "%s/feature.bin" % self.data_dir,
                        'word2vec_model': "%s/word2vec_model" % self.data_dir,
                        'wordcnt_file': "%s/word_counter.bz2" % self.data_dir,
                        'net_model': "%s/net.model" % self.data_dir,
                        'portrait_file': "%s/portrait.pkl" % self.data_dir,
                        'spam_model': "%s/spam_classifier.pkl" % self.data_dir,
                        'spam_rule_file': "%s/Chinese_rules.json" % self.data_dir,
                        'pull_time_flag': "%s/pull_time.flag" % self.data_dir,
                        'tokenizer': 'jieba',  # ['pynlpir', 'jieba']
                        # 'cn_ratio': 0.001,  # used to recognize chinese mail
                        'interval': 'week',  # [day, week, month]
                        'net_lib': 'graph-tool',  # ["graph-tool", "networkx"]
                        'n_jobs': 10,
                        'mail_offline_server': 'http://localhost:27028',
                        'mail_online_server': 'http://localhost:27027',
                        'mail_bak_dir': './mails_bak',
                        'mail_db_file': "%s/mail_db.bin" % self.data_dir,
                        'mail_db_profile': "%s/mail_profile.bin" % self.data_dir,
                        }

    def set_time_flag(self, time_stamp):
        """Overwrite the ``pull_time_flag`` file with ``str(time_stamp)``."""
        # Encode so the binary-mode write also works on Python 3; on Python 2
        # this is a no-op for the ASCII digits written here.
        payload = str(time_stamp).encode('ascii')
        with open(self.etc_map['pull_time_flag'], 'wb') as flag_file:
            flag_file.write(payload)

    def get_time_flag(self):
        """Return the raw content of the ``pull_time_flag`` file."""
        with open(self.etc_map['pull_time_flag'], 'rb') as flag_file:
            return flag_file.read()

    def get_date_flag(self):
        """Return the stored time flag as a local ``YYYY-MM-DD`` date string."""
        time_flag = int(self.get_time_flag())
        return time.strftime('%Y-%m-%d', time.localtime(time_flag))

    @staticmethod
    def timeit(call_func):
        """Decorator that logs (at debug level) how long ``call_func`` took.

        usage: add "@timeit" before function declaration

        :param call_func: function to wrap.
        :return: wrapper returning whatever ``call_func`` returns. If the call
            carries a ``desc`` keyword it is shown in the log line (it is still
            passed through to ``call_func``).
        """
        @functools.wraps(call_func)
        def call_back(*args, **kwargs):
            start_time = time.time()
            call_res = call_func(*args, **kwargs)
            elapsed_time = time.time() - start_time
            MailGlobal.logger.debug(
                'function [{}], desc=[{}] finished in {} second'.format(call_func.__name__, kwargs.get('desc'),
                                                                        int(elapsed_time)))
            return call_res
        return call_back

    @staticmethod
    def timeit_v2(call_func, desc=None):
        """Call ``call_func()`` once, log the elapsed milliseconds, return its result.

        usage: timeit_v2(lambda: func(params))

        :param call_func: zero-argument callable.
        :param desc: label shown in the log line.
        :return: the value returned by ``call_func``.
        """
        start_time = time.time()
        call_res = call_func()
        elapsed_time = time.time() - start_time
        MailGlobal.logger.info('execute=[{}] finished in {} ms'.format(desc, int(elapsed_time * 1000)))
        return call_res

    @staticmethod
    def timeit_v3(call_func, desc=None):
        """Return ``call_func`` itself, uncalled; nothing is timed or logged."""
        return call_func

    @staticmethod
    def max_scale(array):
        """Divide every element by the maximum element.

        :param array: iterable of numbers (one-shot iterators are supported).
        :return: list of floats; ``[]`` for empty input.
        :raises ZeroDivisionError: if the maximum element is 0.
        """
        # Materialise first: max() would otherwise exhaust an iterator and the
        # scaling pass would silently see no elements.
        values = list(array)
        if not values:
            return []
        max_arr = float(max(values))
        return [x / max_arr for x in values]
class Logger:
    """Small file logger that keeps one ``<log_dir>/<YYYY-MM-DD>.log`` per day.

    Every message is prefixed with the current local date-time and a level
    tag. ``info`` and ``error`` also echo the message to stdout.
    """

    def __init__(self, log_dir, debug=False):
        """Open (append mode) today's log file, creating ``log_dir`` if needed.

        :param log_dir: directory holding the log files.
        :param debug: when false, ``debug()`` messages are discarded.
        """
        self._log_dir = log_dir
        # makedirs (not mkdir) so a nested path such as "./data/log" works
        # even when the parent directory does not exist yet.
        if not os.path.isdir(self._log_dir):
            try:
                os.makedirs(self._log_dir)
            except OSError:
                # Tolerate another process creating the directory meanwhile.
                if not os.path.isdir(self._log_dir):
                    raise
        self._debug = debug
        self.DATE_FORMAT = "%Y-%m-%d"
        self.DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"
        self._log_date = self._curdate()
        self._logfile = "%s/%s.log" % (self._log_dir, self._log_date)
        self._logger = open(self._logfile, 'a+')
        self.queue = []
        # When false, the log file is never rotated on date change.
        self.switch = True

    def _curdate(self):
        """Return today's local date formatted with ``DATE_FORMAT``."""
        return time.strftime(self.DATE_FORMAT, time.localtime())

    def _curdatetime(self):
        """Return the current local time formatted with ``DATETIME_FORMAT``."""
        return time.strftime(self.DATETIME_FORMAT, time.localtime())

    def _switch_log(self):
        """Rotate to a new file when the date changed and ``switch`` is on."""
        today = self._curdate()
        if self._log_date != today and self.switch:  # create new logfile
            # close old logfile
            self._logger.close()
            # make new log file
            self._log_date = today
            self._logfile = "%s/%s.log" % (self._log_dir, self._log_date)
            self._logger = open(self._logfile, 'a+')

    def _writer(self, msg):
        """Append ``msg`` plus a newline to the current log file."""
        self._switch_log()
        # maybe locker is needed here
        self._logger.write("%s\n" % msg)
        # Flush so the line is on disk promptly (e.g. before error() exits).
        self._logger.flush()

    def debug(self, msg):
        """Write a ``[DEBUG]`` line; a no-op unless debug mode is enabled."""
        if self._debug:
            msg = "%s [DEBUG] %s" % (self._curdatetime(), msg)
            self._writer(msg)

    def info(self, msg):
        """Print and write an ``[INFO]`` line."""
        msg = "%s [INFO] %s" % (self._curdatetime(), msg)
        # Single-argument call form prints identically on Python 2 and 3.
        print(msg)
        self._writer(msg)

    def warn(self, msg, log_queue=None):
        """Write a ``[WARN]`` line, or append it to ``log_queue`` if given.

        :param log_queue: optional list; when provided the formatted line is
            appended to it instead of being written to the file.
        """
        msg = "%s [WARN] %s" % (self._curdatetime(), msg)
        if log_queue is None:
            self._writer(msg)
        else:
            log_queue.append(msg)

    def error(self, msg, to_exit=False):
        """Print and write an ``[ERROR]`` line.

        :param to_exit: when true, terminate via ``sys.exit(-1)`` afterwards.
        """
        msg = "%s [ERROR] %s" % (self._curdatetime(), msg)
        print(msg)
        self._writer(msg)
        if to_exit:
            sys.exit(-1)
class TimeoutException(Exception):
    """Raised by the SIGALRM handler that ``time_limit`` installs."""
@contextmanager
def time_limit(seconds):
    """Context manager that raises ``TimeoutException`` after ``seconds``.

    It arms ``signal.alarm(seconds)`` with a SIGALRM handler that raises
    ``TimeoutException``. On exit the alarm is cancelled and the SIGALRM
    handler that was active before is put back, so the raising handler does
    not leak past the ``with`` block.

    :param seconds: value passed to ``signal.alarm``.
    :raises TimeoutException: inside the ``with`` body when the alarm fires.
    """
    def signal_handler(signum, frame):
        raise TimeoutException

    previous_handler = signal.signal(signal.SIGALRM, signal_handler)
    try:
        # Arm inside the try so cleanup also runs if arming itself fails.
        signal.alarm(seconds)
        yield
    finally:
        signal.alarm(0)
        # signal.signal() returns None when the old handler was not installed
        # from Python; such a handler cannot be re-installed from here.
        if previous_handler is not None:
            signal.signal(signal.SIGALRM, previous_handler)
# Module-level shared instance. The timeit helpers log through
# ``MailGlobal.logger``. Creating it at import time runs
# ``MailGlobalObject.__init__``, which builds the Logger (log directory and
# log file are opened as a side effect of importing this module).
MailGlobal = MailGlobalObject()