/mail_reply_parser.py
Python | 511 lines | 482 code | 17 blank | 12 comment | 22 complexity | bdd1b894032b7c58a3e70e226aedceb3 MD5 | raw file
- #!/usr/bin/python
- # -*- coding: utf-8 -*-
- # author: yaojian-xy <yaojian-xy@360.cn>
- import re
- import sys
- import hashlib
- import time
- import simplejson as json
- import numpy as np
- from sklearn.externals.joblib import Parallel, delayed
- from mail_entity import Mail
- from mail_global import MailGlobal, TimeoutException, time_limit
- from mail_preprocess import advanced_parse_time, parse_address, parse_mail_body
- reload(sys)
- sys.setdefaultencoding("utf-8")
class MailReplyParser:
    """
    Split the body of a reply/forward mail into the quoted messages it embeds.

    A quoted message is recognised by its textual header block
    (From / Sent / To / Cc / Subject, in Chinese or English).  All methods are
    static; the class only serves as a namespace for the patterns and constants
    below.
    """

    # Canonical keys under which recognised header values are stored in a fragment.
    KEY_FROM, KEY_TO, KEY_TIME, KEY_CC, KEY_SUBJ = 'from', 'to', 'time', 'cc', 'subj'
    # Header label as it appears in the quoted text -> canonical key.
    HEADER_MAP = {u'发件人': KEY_FROM, u'寄件者': KEY_FROM, u'From': KEY_FROM,
                  u'发送时间': KEY_TIME, u'寄件日期': KEY_TIME, u'Date': KEY_TIME, u'Sent': KEY_TIME,
                  u'收件人': KEY_TO, u'收件者': KEY_TO, u'To': KEY_TO,
                  u'抄送': KEY_CC, u'抄 送': KEY_CC, u'抄送人': KEY_CC, u'副本': KEY_CC, u'Cc': KEY_CC,
                  u'主题': KEY_SUBJ, u'主 题': KEY_SUBJ, u'主旨': KEY_SUBJ, u'Subject': KEY_SUBJ,
                  }
    # One header line: optional quote markers, a known label, a colon, the value.
    HEADER_REGEX = re.compile(u'^[ ]*[>\*]{,3}[ ]*\*?(%s)[::]{1}(.+)' % '|'.join(HEADER_MAP.keys()))
    # Same pattern applied to a whole body; captures only the label.
    Multi_HEADER_REGEX = re.compile(u'^[ ]*[>\*]{,3}[ ]*\*?(%s)[::]{1}.+' % '|'.join(HEADER_MAP.keys()), re.MULTILINE)
    # Subject prefixes that mark a reply or a forward.
    SUBJECT_REGEX = re.compile(u'^[ ]*(回复|答复|回覆|re|转发|Fw)[::]{1}.+', re.IGNORECASE)
    # "----- Original Message -----" style separator line.
    HR_LINE_REGEX = re.compile(u'^[ ]*[>\*]{,2}[-]{2,}[ ]*(原始邮件|Original Message)[ ]*[-]{2,}.*', re.IGNORECASE)
    ADDR_RM_REG = re.compile(u'\"|mailto:')
    ADDR_REG = re.compile(u'^[>|\]]([^<]*)[<|\[](.*)')
    # Slot index inside an interaction edge ``[to_count, cc_count]``.
    FROM_TO, FROM_CC = 0, 1
    REPLY_MAIL, RELAY_MAIL = 'Reply', 'Forward'
    # Outcome flags returned by ``extract_reply_fragment``.
    EXTRACT_FLAG_NORMAL = 0
    EXTRACT_FLAG_LACK_INFO = 1
    EXTRACT_FLAG_NO_HEADER = 2
    # The body is scanned bottom-up, so a header block *starts* at its last
    # line (a subject label) and *ends* at its first line (a sender label).
    HEADER_END_KEY_SET = set([u'发件人', u'寄件者', u'From'])
    HEADER_START_KEY_SET = set([u'主题', u'主 题', u'主旨', u'Subject'])
    # States of the bottom-up scanner.
    REPLY_STATUS_BODY = 0
    REPLY_STATUS_HEADER = 1
    REPLY_STATUS_HEADER_END = 2
    REG_FILTER_CHAR = re.compile(u"[ \r\n]")

    def __init__(self):
        """Nothing to initialise; every method is static."""
        pass

    @staticmethod
    def extract_reply_fragment(mail_body):
        """
        Cut a mail body into fragments, one per embedded message.

        The body is normalised (CRLF -> LF, runs of newlines collapsed) and
        scanned from the last line to the first.

        :param mail_body: raw mail body text
        :return: ``(mail_body_lines, body_fragments, flag)``.  Each fragment is
            a dict with ``'body': {'st': i, 'ed': j}`` (inclusive line indices
            into ``mail_body_lines``) plus the raw header values found above
            that text, keyed by 'from' / 'to' / 'time' / 'cc' / 'subj'.
            Fragments are ordered bottom-to-top; the final entry is whatever
            was being accumulated when the scan reached the first line.
            ``flag`` is one of the ``EXTRACT_FLAG_*`` constants.
        """
        # preprocess mail_body
        mail_body = mail_body.replace('\r\n', '\n')
        mail_body = re.sub('[\n]{2,}', '\n', mail_body)
        mail_body_lines = mail_body.split('\n')
        last_idx = len(mail_body_lines) - 1

        flag = MailReplyParser.EXTRACT_FLAG_NORMAL
        reply_status = MailReplyParser.REPLY_STATUS_BODY
        reply_header_past_indics = None  # index of the header line seen most recently
        body_fragments = []
        body_fragment = {'body': {'ed': last_idx, 'st': last_idx}}
        found_header = False

        for idx in range(last_idx, -1, -1):  # traverse from end to begin
            line = mail_body_lines[idx]
            header_pair = MailReplyParser.HEADER_REGEX.findall(line)
            if len(header_pair) == 1:  # header part
                found_header = True
                header_nm, head_val = header_pair[0]
                if reply_status == MailReplyParser.REPLY_STATUS_BODY:  # header part start
                    if header_nm not in MailReplyParser.HEADER_START_KEY_SET:
                        # a header-looking line inside plain text: ignore it
                        continue
                    reply_status = MailReplyParser.REPLY_STATUS_HEADER
                    reply_header_past_indics = idx
                elif reply_status == MailReplyParser.REPLY_STATUS_HEADER:
                    # Lines between this header and the one below it are
                    # continuation lines of this header's value.
                    head_val += ''.join(mail_body_lines[idx + 1:reply_header_past_indics])
                    reply_header_past_indics = idx
                    if header_nm in MailReplyParser.HEADER_END_KEY_SET:
                        reply_status = MailReplyParser.REPLY_STATUS_HEADER_END
                body_fragment[MailReplyParser.HEADER_MAP[header_nm]] = head_val
            else:  # text part
                if reply_status == MailReplyParser.REPLY_STATUS_HEADER:
                    # continuation line inside a header block; merged above
                    continue
                elif reply_status == MailReplyParser.REPLY_STATUS_HEADER_END:  # header part end
                    span = body_fragment['body']
                    if span['st'] <= span['ed'] \
                            and all(key in body_fragment for key in ('subj', 'body', 'from')):
                        body_fragments.append(body_fragment)
                    else:
                        MailGlobal.logger.warn("reply msg extraction, lack info: %s" % str(body_fragment.keys()))
                        flag = MailReplyParser.EXTRACT_FLAG_LACK_INFO
                    # start accumulating the message that sits above the header block
                    body_fragment = {'body': {'ed': idx, 'st': idx}}
                    reply_status = MailReplyParser.REPLY_STATUS_BODY
                if MailReplyParser.HR_LINE_REGEX.match(line):  # split line
                    body_fragment['body']['ed'] = idx - 1
                else:
                    body_fragment['body']['st'] = idx

        if not found_header:  # not found any mail header
            flag = MailReplyParser.EXTRACT_FLAG_NO_HEADER
        body_fragments.append(body_fragment)  # append last fragment
        return mail_body_lines, body_fragments, flag

    @staticmethod
    def extract_addr(addr, alias_map, log_queue=None, desc=""):
        """
        Parse a raw header value into a list of address dicts.

        :param addr: raw header value, possibly holding several addresses
        :param alias_map: display-name -> address map; used to resolve units
            that ``parse_address`` cannot parse and extended with newly seen names
        :param log_queue: optional message list handed to ``parse_address``
        :param desc: label handed to ``parse_address`` (e.g. "from", "to")
        :return: addr list; every entry gets a ``'flag'`` key (default 0)
        """
        addr = MailReplyParser.ADDR_RM_REG.sub('', addr)
        # find email delim: the candidate that occurs most often wins
        # NOTE(review): the last two candidates are identical as written --
        # one was possibly a full-width comma originally; confirm.
        prob_delim = [u';', u",", u',']
        delim = ';'
        max_delim_cnt = 0
        for dlch in prob_delim:
            cnt = addr.count(dlch)
            if cnt > max_delim_cnt:
                delim, max_delim_cnt = dlch, cnt
        # extract address
        addr_list = []
        for addr_unit in addr.split(delim):
            addr_unit = addr_unit.strip()
            addr_parsed = parse_address(addr_unit, desc=desc, log_queue=log_queue)
            if addr_parsed != "":
                addr_list.append(addr_parsed)
                if 'n' in addr_parsed and addr_parsed['n'] not in alias_map:
                    alias_map[addr_parsed['n']] = addr_parsed['a']
            elif addr_unit in alias_map:
                # The unit is a bare display name we already know.
                addr_list.append({'a': alias_map[addr_unit], 'n': addr_unit})
                # Presumably parse_address queued a failure message for this
                # unit; drop it now that the alias resolved -- TODO confirm.
                # Guarded so a missing or empty queue no longer raises.
                if log_queue:
                    log_queue.pop()
        for entry in addr_list:
            entry.setdefault('flag', 0)
        return addr_list

    @staticmethod
    def trim(raw_text):
        """Strip leading/trailing spaces, CR and LF from ``raw_text``."""
        return raw_text.strip(' \r\n')

    @staticmethod
    def alias_collect(x, alias_map):
        """Record ``x['n'] -> x['a']`` in ``alias_map`` when ``x`` is a dict with a name."""
        if isinstance(x, dict) and 'n' in x:
            alias_map[x['n']] = x['a']

    @staticmethod
    def _build_alias_map(mail):
        """Collect display-name -> address pairs from the mail's from/to/cc fields."""
        alias_map = dict()
        MailReplyParser.alias_collect(mail['from'], alias_map)
        for person in mail['to']:
            MailReplyParser.alias_collect(person, alias_map)
        for person in mail['cc']:
            MailReplyParser.alias_collect(person, alias_map)
        return alias_map

    @staticmethod
    def add_edge(net, addr1, addr2, itype, enhance=0.5):
        """
        Count one interaction from ``addr1`` to ``addr2`` in ``net``.

        :param net: dict mapping ``(addr1, addr2)`` to ``[to_count, cc_count]``
        :param itype: slot to increment (``FROM_TO`` or ``FROM_CC``)
        :param enhance: unused; reverse-edge reinforcement is disabled
        """
        net.setdefault((addr1, addr2), [0, 0])[itype] += 1

    @staticmethod
    def parse_fragment_addr(body_fragments, mail, log_queue=None):
        """
        Parse the address headers of each fragment in place and keep the usable ones.

        A fragment is kept only if it has exactly one sender and at least one
        to/cc recipient after parsing.

        :param body_fragments: fragments from ``extract_reply_fragment``
        :param mail: mapping with 'from', 'to', 'cc' used to seed the alias map
        :param log_queue: optional message list; an empty string is appended
            per fragment as a separator
        :return: list of fragments whose 'from' is an address dict and whose
            'to' / 'cc' are address lists
        """
        alias_map = MailReplyParser._build_alias_map(mail)
        new_body_fragments = []
        for frag in body_fragments:
            if log_queue is not None:
                log_queue.append("")
            if 'from' not in frag or ('to' not in frag and 'cc' not in frag):
                continue
            addr_from = MailReplyParser.extract_addr(frag['from'], alias_map, desc="from", log_queue=log_queue)
            if len(addr_from) != 1:
                continue
            frag['from'] = addr_from[0]
            for key in ('to', 'cc'):
                if key in frag:
                    frag[key] = MailReplyParser.extract_addr(frag[key], alias_map, desc=key, log_queue=log_queue)
                else:
                    frag[key] = []
            if len(frag['to']) == 0 and len(frag['cc']) == 0:
                continue
            new_body_fragments.append(frag)
        return new_body_fragments

    @staticmethod
    def extract_interactive(body_fragments, mail, log_queue=None):
        """
        Build the sender -> recipient interaction counts of the quoted messages.

        :param body_fragments: fragments from ``extract_reply_fragment``
        :param mail: mapping with 'from', 'to', 'cc' used to seed the alias map
        :param log_queue: optional message list; an empty string is appended
            per fragment as a separator
        :return: dict ``(from_addr, other_addr) -> [to_count, cc_count]``
        """
        alias_map = MailReplyParser._build_alias_map(mail)
        inter_net = {}
        for frag in body_fragments:
            if log_queue is not None:
                log_queue.append("")
            if 'from' not in frag or ('to' not in frag and 'cc' not in frag):
                continue
            addr_from = MailReplyParser.extract_addr(frag['from'], alias_map, desc="from", log_queue=log_queue)
            if len(addr_from) != 1:
                continue
            addr_from = addr_from[0]['a']
            if 'to' in frag:
                for person in MailReplyParser.extract_addr(frag['to'], alias_map, desc="to", log_queue=log_queue):
                    MailReplyParser.add_edge(inter_net, addr_from, person['a'], MailReplyParser.FROM_TO)
            if 'cc' in frag:
                for person in MailReplyParser.extract_addr(frag['cc'], alias_map, desc="cc", log_queue=log_queue):
                    MailReplyParser.add_edge(inter_net, addr_from, person['a'], MailReplyParser.FROM_CC)
        return inter_net

    @staticmethod
    def parse_reply(mail, msg_type='all', log_queue=None):
        """
        Extract the message text and the interaction network of a reply mail.

        :param mail: mapping with 'subj', 'body', 'from', 'to', 'cc'
        :param msg_type: [all / last]; 'all' concatenates the text of every
            fragment from the top of the body downwards (each followed by a
            newline), anything else returns only the top-most fragment's text
        :param log_queue: optional message list passed on to address parsing
        :return: ``(inner_msg, inter_net)``, or ``(None, None)`` when the mail
            is not a reply
        """
        if not MailReplyParser.is_reply(mail):  # if not a reply message
            return None, None
        mail_body_lines, body_fragments, _ = MailReplyParser.extract_reply_fragment(mail['body'])

        def fragment_text(frag):
            span = frag['body']
            return '\n'.join(mail_body_lines[span['st']:span['ed'] + 1])

        if msg_type == 'all':
            # fragments are stored bottom-up; reverse to restore reading order
            inner_msg = ''.join(fragment_text(frag) + '\n' for frag in reversed(body_fragments))
        else:
            inner_msg = fragment_text(body_fragments[-1])
        # extract interactive part
        inter_net = MailReplyParser.extract_interactive(body_fragments, mail, log_queue=log_queue)
        return inner_msg, inter_net

    @staticmethod
    def category(mail):
        """
        Classify a mail as reply or forward from its subject prefix.

        :param mail: mapping with a 'subject' key
        :return: ``RELAY_MAIL`` when the subject starts with a forward prefix,
            otherwise ``REPLY_MAIL`` (also when no prefix is present)
        """
        kw = MailReplyParser.SUBJECT_REGEX.findall(mail['subject'])
        if kw and re.match(u'转发|Fw', kw[0], re.IGNORECASE):
            return MailReplyParser.RELAY_MAIL
        return MailReplyParser.REPLY_MAIL

    @staticmethod
    def is_reply(mail):
        """
        Check whether a mail is a replied/forwarded mail.

        :param mail: mapping with 'subj' and 'body'
        :return: True when the subject carries a reply/forward prefix or the
            body contains at least four distinct header labels
        """
        # check subject
        if MailReplyParser.SUBJECT_REGEX.match(mail['subj']):
            return True
        # check body
        header_match = MailReplyParser.Multi_HEADER_REGEX.findall(mail['body'])
        return len(set(header_match)) >= 4  # 4 means (from, to, time, subject)
def batch_reply_parser(mail_datas, n_jobs=MailGlobal.etc_map['n_jobs']):
    """
    Run ``single_reply_parse`` over every mail and merge the per-mail results.

    :param mail_datas: iterable of raw mails, handed one by one to ``single_reply_parse``
    :param n_jobs: worker count forwarded to ``Parallel``
    :return: ``(mail_reply_datas, mail_links)`` -- the flattened list of all
        mails collected in the merged map, and the concatenated link list
    """
    merged_map = {}
    mail_links = []
    # slots: [mails with extracted replies, extracted replies, unable to extract,
    #         lacking info, no header, messy code] -- see the log messages below
    reply_count = np.array([0] * 6)
    extract_info = {"has_time": 0.0}

    if MailGlobal.DEBUG_MODEL:
        verbose = 1
    else:
        verbose = 0
    jobs = (delayed(single_reply_parse)(index, mail) for index, mail in enumerate(mail_datas))
    result = Parallel(n_jobs=n_jobs, verbose=verbose)(jobs)

    for _, item in result:
        for key, mails in item.get('map', {}).items():
            merged_map.setdefault(key, []).extend(mails)
        if 'link' in item:
            mail_links.extend(item['link'])
        if 'count' in item:
            reply_count += item['count']
        # NOTE(review): the result dicts built in this file only carry
        # 'map' / 'link' / 'count', so this branch looks like it never fires -- confirm.
        if "proto" not in item and "time" in item:
            extract_info['has_time'] += 1

    log = MailGlobal.logger
    log.info("extract reply msg mail: %d, including reply number: %d" % tuple(reply_count[0:2]))
    log.info("unable to extract reply mail: %d, including lacking info mail=%d, "
             "no header mail=%d, and (messy code) mail=%d" % tuple(reply_count[2:6]))
    log.info("extract reply msg mail: time field freq=%d, ratio=%.6f"
             % (reply_count[0], extract_info['has_time'] / (reply_count[0] + 0.1)))

    if MailGlobal.DEBUG_MODEL:
        mail_map_count = sum(len(mails) for mails in merged_map.values())
        log.debug("mail map number: %d" % mail_map_count)
        log.debug("mail link number: %d" % len(mail_links))

    # flatten the hash -> [mail, ...] map into one list
    mail_reply_datas = [mail for mails in merged_map.values() for mail in mails]
    return mail_reply_datas, mail_links
def single_reply_parse(index, json_mail, time_it=10):
    """
    Parse one raw mail under a time limit and index every message found in it.

    The mail is wrapped in a ``Mail``, its body normalised, and -- when it looks
    like a reply -- split into the quoted messages it contains.  Each resulting
    mail dict is normalised, hashed and stored in a local map keyed by its
    ``md5_ftsp`` hash; consecutive fragments are linked.

    :param index: position of the mail in the batch; returned unchanged
    :param json_mail: raw mail passed to the ``Mail`` constructor
    :param time_it: limit passed to ``time_limit``
    :return: ``(index, result)`` where ``result`` always has ``'map'``, plus
        ``'count'`` (6-slot numpy array) on reply paths and ``'link'`` when
        fragments were extracted
    """
    def write_log(_mail, _queue):
        # Dump the queued messages for this mail, grouped into an "original"
        # section (messages before the first "" separator) and numbered
        # "floor" sections (runs of messages between "" separators).
        has_msg = False
        for msg in _queue:
            if len(msg) > 0:
                has_msg = True
                break
        if has_msg:
            MailGlobal.logger._writer("#"*20 + _mail["_id"] + " BEGIN " + "#"*20)
            step = 0
            if _queue[0] != "":  # original error
                MailGlobal.logger._writer("# original")
                while step < len(_queue) and _queue[step] != "":  # log original error
                    MailGlobal.logger._writer(_queue[step])
                    step += 1
            floor_no = 0
            floor_zone = False
            while step < len(_queue):
                msg = _queue[step]
                if msg == "":
                    # separator: the next non-empty message opens a new floor
                    floor_zone = False
                else:
                    if not floor_zone:
                        floor_no += 1
                        floor_zone = True
                        MailGlobal.logger._writer("# floor %d" % floor_no)
                    MailGlobal.logger._writer(msg)
                step += 1
            MailGlobal.logger._writer("#"*20 + " END " + "#"*20)

    def parse_mail_time(mail):
        # Normalise mail['time'] to "YYYY-mm-dd HH:MM:SS" (local time) when it
        # can be parsed; otherwise keep the stripped original text.
        if not 'time' in mail:
            mail["time"] = ""
        if type(mail['time']) is not float:
            mtime = advanced_parse_time(mail['time'])
            # NOTE(review): identity comparison -- assumes INVALID_TIME is a
            # singleton sentinel returned as-is by advanced_parse_time; confirm.
            if mtime is not MailGlobal.INVALID_TIME:
                mail['time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(mtime))
            else:
                mail['time'] = mail['time'].strip()
        else:
            mail['time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(mail['time']))

    def parse_messy_text(mail):
        # Strip subject/body and rename the keys:
        # subj -> subject, body -> plain, attach -> attachment.
        mail['subj'] = mail['subj'].strip(" \r\n")
        mail['body'] = mail['body'].strip(" \r\n")
        mail['subject'] = mail.pop('subj')
        mail['plain'] = mail.pop('body')
        if 'attach' in mail:
            mail['attachment'] = mail.pop('attach')

    def add_mail_map(mail, mail_type="original"):
        # Normalise the mail dict in place, compute its hashes and register it
        # in the enclosing ``mail_map``.  Because the keys are renamed, calling
        # this twice on the same dict would fail on the missing 'subj'/'body'.
        # parse time
        parse_mail_time(mail)
        # parse messy code
        parse_messy_text(mail)
        if "_id" not in mail:
            mail["_id"] = hashlib.md5(json.dumps(mail)).hexdigest()
        # md5_sp: hash of subject + plain text with spaces/CR/LF removed
        raw_text = MailReplyParser.REG_FILTER_CHAR.sub('', "%s %s" % (mail['subject'], mail['plain'])).encode('utf8')
        mail["md5_sp"] = hashlib.md5(raw_text).hexdigest()
        # trim mail time
        # (drops the last three characters when the time matches
        # MailGlobal.time_standard -- presumably the ":SS" part; confirm)
        if MailGlobal.time_standard.match(mail["time"]):
            raw_time = mail["time"][: -3]
        else:
            raw_time = mail["time"]
        # md5_ftsp: hash of sender address + trimmed time + filtered text
        mail["md5_ftsp"] = hashlib.md5("%s %s %s" % (mail["from"]['a'].encode("utf8"), raw_time, raw_text)).hexdigest()
        if "mid" not in mail:
            mail["mid"] = mail["md5_ftsp"]
        key = mail["md5_ftsp"]
        if key not in mail_map:
            mail_map[key] = [mail]
        else:
            mail_map[key].append(mail)
        mail['type'] = mail_type

    # md5_ftsp -> [mail dict, ...]; filled by add_mail_map via closure
    mail_map = dict()
    try:
        with time_limit(time_it):
            log_queue = []
            mail = Mail(json_mail, log_queue=log_queue, extra=True)
            # parse mail body
            mail["body"] = parse_mail_body(mail["body"])
            mail_links = []
            reply_parser = MailGlobal.reply_parser
            # reply count (6 elements):
            # [0] mails with extracted replies, [1] extracted replies,
            # [2] reply mails without any extracted fragment,
            # [3] ... of which lacking info, [4] ... of which without header,
            # [5] ... of which containing U+FFFD
            reply_count = np.array([0] * 6)
            # MailGlobal.logger.debug("subject: %s" % mail['subj'])
            # get mail md5 txt
            mail['subj'] = mail['subj'].strip()
            mail_body = re.sub('[\n]{2,}', '\n', mail['body'])
            mail['body'] = MailReplyParser.trim(mail_body)
            # extract mail reply
            if reply_parser.is_reply(mail):  # reply mail
                mail_body_lines, body_fragments, flag = MailReplyParser.extract_reply_fragment(mail['body'])
                # mail['flag'] = flag
                # handle fragment
                if len(body_fragments) == 1:  # not extract any fragment
                    add_mail_map(mail.m)
                    reply_count[2] += 1
                    MailGlobal.logger.debug("unable to extract reply msg")
                    if flag == MailReplyParser.EXTRACT_FLAG_LACK_INFO:
                        reply_count[3] += 1
                    elif flag == MailReplyParser.EXTRACT_FLAG_NO_HEADER:
                        reply_count[4] += 1
                    # U+FFFD (replacement character) marks undecodable text
                    if mail['plain'].find(u'\ufffd') >= 0:
                        reply_count[5] += 1
                    # NOTE(review): unlike the other paths, write_log is not called here
                    return index, {'map': mail_map, 'count': reply_count}
                # parse address and filter some fragement
                parsed_body_fragments = MailReplyParser.parse_fragment_addr(body_fragments, mail, log_queue=log_queue)
                # append original
                parsed_body_fragments.append(body_fragments[-1])
                body_fragments = parsed_body_fragments
                # modify last body fragment
                # (the mail itself takes the place of the last fragment and
                # borrows that fragment's {'st', 'ed'} line span as its body)
                mail['body'] = body_fragments[-1]['body']
                body_fragments[-1] = mail.m
                MailGlobal.logger.debug("extract reply msg number: %d" % (len(body_fragments) - 1))
                reply_count[0] += 1
                reply_count[1] += len(body_fragments) - 1
                for idx, frag in enumerate(body_fragments):
                    frag['subj'] = frag['subj'].strip()
                    # replace the line span by the actual text it covers
                    frag['body'] = '\n'.join(mail_body_lines[frag['body']['st']: (frag['body']['ed'] + 1)])
                    frag['body'] = MailReplyParser.trim(frag['body'])
                    # only the last entry is the mail itself
                    mail_type = "original"
                    if idx < (len(body_fragments) - 1):
                        mail_type = "extracted"
                    add_mail_map(frag, mail_type=mail_type)
                    if idx >= 1:
                        # link this message to the fragment preceding it in the list
                        pre_frag = body_fragments[idx - 1]
                        link_type = MailReplyParser.category(frag)
                        mail_links.append({'node1': frag['mid'], 'node2': pre_frag['mid'], 'type': link_type})
                # write to log
                write_log(mail, log_queue)
                return index, {'map': mail_map, 'link': mail_links, 'count': reply_count}
            else:  # not reply mail
                MailGlobal.logger.debug("mail is not reply mail")
                add_mail_map(mail.m, mail_type="original")
                # write to log
                write_log(mail, log_queue)
                return index, {'map': mail_map}
    except TimeoutException:
        # NOTE(review): if the timeout fires before ``Mail(...)`` returns,
        # ``mail`` is unbound here; if it fires after add_mail_map already ran
        # on mail.m, the 'body'/'subj' keys are gone -- confirm both cases.
        MailGlobal.logger.warn("reply parser exec time out, index=%s, text=%s" % (index, mail.m['body'].encode('utf8')))
        add_mail_map(mail.m)
        return index, {'map': mail_map}
def test():
    """Manual smoke test: fragment a sample reply body, parse its addresses and print the results."""
    from mail_preprocess import advanced_parse_time
    sample_body = u"数据已修改确认过,详见附件~\r\n\r\n\r\n\r\n成为信息安全领域最信赖的、技术领先的、\r\n国际知名的方案、产品和服务提供商\r\n------------------------------------------------------\r\n人力资源部 冯萍\r\n网神信息技术(北京)股份有限公司 \r\n地址:北京市海淀区上地开拓路7号先锋大厦二段一层(100085)\r\n手机:18618105615\r\n固话:010-62972892\r\n传真:010-62972896(请注明收件人)\r\n客户热线:400-610-8220\r\n邮箱:fengping@legendsec.com\r\n网址:www.legendsec.com\r\n \r\n发件人: 王冠华\r\n发送时间: 2016-03-04 11:55\r\n收件人: fengping@legendsec.com; guohx@legendsec.com\r\n抄送: guomc (guomc@legendsec.com); 郭建华 (guojh@legendsec.com)\r\n主题: HC预算表格更新\r\n \r\n冯萍,宏霞好,\r\n \r\n请把安服与安管中心,供应链,品牌推广,合作发展部,财务和人力行政这几个部门的情况填入附件表格\r\n \r\n不懂的地方可以问问建华怎么填,另你那边涉及到多种用工形式,把各用工形式拆开\r\n \r\n另建华,慕初,宏霞和冯萍你们三个做完了,统一都给慕初一版,慕初统一看一下我们不要拉下网神一个人\r\n \r\n今天下班前需要反馈我\r\n \r\n有任何问题可以随时联络我\r\n \r\n说的不全的慕初和建华直接补充\r\n \r\n \r\n王冠华\r\n人力资源部\r\n \r\n奇虎360\r\n手机:15727302652\r\n电话:010-52448453\r\n邮件:wangguanhua@360.cn\r\n地址:北京市朝阳区酒仙桥路6号院(电子城•国际电子总部)2号楼B座5层\r\n \r\n"
    messages = []
    # envelope of the outer mail; seeds the alias map used for address parsing
    envelope = {'from': {'a': "wangguanhua@360.cn"},
                "to": [{'n': u'王冠华', 'a': "wangguanhua@360.cn"}], "cc": []}
    _lines, fragments, flag = MailReplyParser.extract_reply_fragment(sample_body)
    fragments = MailReplyParser.parse_fragment_addr(fragments, envelope, log_queue=messages)
    # single-argument print(...) behaves the same as the print statement on Python 2
    print('\n'.join(messages))
    print('\n' * 3)
    print("flag =  %s" % (flag,))
    for fragment in fragments:
        print(fragment)
    # parse a Chinese-formatted timestamp
    stamp = u"2015年12月4日 15:13"
    print(advanced_parse_time(stamp))
if __name__ == "__main__":
    # Script entry point: parse the mails in 'reply_test_2.txt' (one JSON
    # document per line) and print the first ten parsed mails and links.
    # test()
    # json_mails = json.load(open('reply_test.txt', 'r'))
    # Read inside a ``with`` block so the file handle is closed deterministically.
    with open('reply_test_2.txt', 'r') as mail_file:
        json_mails = [json.loads(raw_line.strip()) for raw_line in mail_file]
    MailGlobal.register_instance('reply_parser', MailGlobal.timeit_v2(lambda: MailReplyParser(),
                                                                      desc="load reply parser "))
    if len(json_mails) > 0:
        # mail_datas = load_json_mails(json_mails)
        # mail_reply_datas (type of list)
        # mail_reply_links (type of list)
        mail_reply_datas, mail_reply_links = batch_reply_parser(json_mails, n_jobs=1)
        # single-argument print(...) behaves the same as the print statement on Python 2
        print('\n'.join(map(str, mail_reply_datas[:10])))
        print("\n\n")
        print('\n'.join(map(str, mail_reply_links[:10])))