mail_reply_parser.py

/mail_reply_parser.py

https://gitlab.com/yaojian/RenjuAI
Python | 511 lines | 482 code | 17 blank | 12 comment | 22 complexity | bdd1b894032b7c58a3e70e226aedceb3 MD5 | raw file

#!/usr/bin/python
#  -*- coding: utf-8 -*-
# author: yaojian-xy <yaojian-xy@360.cn>

import re
import sys
import hashlib
import time
import simplejson as json

import numpy as np
from sklearn.externals.joblib import Parallel, delayed

from mail_entity import Mail
from mail_global import MailGlobal, TimeoutException, time_limit
from mail_preprocess import advanced_parse_time, parse_address, parse_mail_body

# Python 2 only idiom: make UTF-8 the process-wide default encoding so that the
# implicit str <-> unicode conversions performed on the (largely Chinese) mail
# text in this module use UTF-8. sys is reloaded first because
# setdefaultencoding is not reachable on the sys module as initially imported.
# NOTE(review): this changes behaviour for the whole interpreter, not only for
# this module, and the calls do not exist on Python 3.
reload(sys)
sys.setdefaultencoding("utf-8")


class MailReplyParser:
    """
    Parser for the quoted reply / forward blocks embedded in a mail body.

    Every method is static; the class acts as a namespace for the regular
    expressions and state constants used while scanning a body.

    Key naming: is_reply() and parse_reply() read the subject from
    mail['subj'], whereas category() reads mail['subject'].
    """
    # canonical field names stored in a body fragment
    KEY_FROM, KEY_TO, KEY_TIME, KEY_CC, KEY_SUBJ = 'from', 'to', 'time', 'cc', 'subj'
    # header label as written in a quoted block -> canonical field name
    HEADER_MAP = {u'发件人': KEY_FROM, u'寄件者': KEY_FROM, u'From': KEY_FROM,
                  u'发送时间': KEY_TIME, u'寄件日期': KEY_TIME, u'Date': KEY_TIME, u'Sent': KEY_TIME,
                  u'收件人': KEY_TO, u'收件者': KEY_TO, u'To': KEY_TO,
                  u'抄送': KEY_CC, u'抄　送': KEY_CC, u'抄送人': KEY_CC, u'副本': KEY_CC, u'Cc': KEY_CC,
                  u'主题': KEY_SUBJ, u'主　题': KEY_SUBJ, u'主旨': KEY_SUBJ, u'Subject': KEY_SUBJ,
                  }
    # alternation of every known header label, shared by both header regexes
    _HEADER_ALTERNATION = '|'.join(HEADER_MAP.keys())
    # one header line: optional quote markers ('>' / '*'), label, colon, value
    HEADER_REGEX = re.compile(u'^[ ]*[>\\*]{,3}[ ]*\\*?(%s)[：:]{1}(.+)' % _HEADER_ALTERNATION)
    # same shape, but usable with findall() over a whole multi-line body
    Multi_HEADER_REGEX = re.compile(u'^[ ]*[>\\*]{,3}[ ]*\\*?(%s)[：:]{1}.+' % _HEADER_ALTERNATION, re.MULTILINE)
    # subject prefixes that mark a reply or a forward
    SUBJECT_REGEX = re.compile(u'^[ ]*(回复|答复|回覆|re|转发|Fw)[：:]{1}.+', re.IGNORECASE)
    # the subset of subject prefixes that mark a forward
    FORWARD_REGEX = re.compile(u'转发|Fw', re.IGNORECASE)

    # "----- Original Message -----" style separator line
    HR_LINE_REGEX = re.compile(u'^[ ]*[>\\*]{,2}[-]{2,}[ ]*(原始邮件|Original Message)[ ]*[-]{2,}.*', re.IGNORECASE)

    # noise removed from an address field before it is split
    ADDR_RM_REG = re.compile(u'\"|mailto:')
    ADDR_REG = re.compile(u'^[>|\\]]([^<]*)[<|\\[](.*)')

    # index into the [to_count, cc_count] pair kept per edge by add_edge()
    FROM_TO, FROM_CC = 0, 1
    REPLY_MAIL, RELAY_MAIL = 'Reply', 'Forward'

    # result flags of extract_reply_fragment()
    EXTRACT_FLAG_NORMAL = 0
    EXTRACT_FLAG_LACK_INFO = 1
    EXTRACT_FLAG_NO_HEADER = 2

    # The body is scanned bottom-up, so a header block is *entered* at its
    # subject line and *left* at its sender line.
    HEADER_END_KEY_SET = set([u'发件人', u'寄件者', u'From'])
    HEADER_START_KEY_SET = set([u'主题', u'主　题', u'主旨', u'Subject'])

    # scanner states of extract_reply_fragment()
    REPLY_STATUS_BODY = 0
    REPLY_STATUS_HEADER = 1
    REPLY_STATUS_HEADER_END = 2

    REG_FILTER_CHAR = re.compile(u"[ \r\n]")

    def __init__(self):
        pass

    @staticmethod
    def extract_reply_fragment(mail_body):
        """
        Split a mail body into its top text and the quoted mails below it.

        The body is normalised ('\\r\\n' -> '\\n', runs of newlines squeezed to
        one) and then scanned from the last line to the first.

        :param mail_body: mail body text
        :return: tuple (mail_body_lines, body_fragments, flag)
            mail_body_lines: the normalised body split into lines
            body_fragments: list of dicts in bottom-up order; each has
                'body': {'st': first_line, 'ed': last_line} (inclusive indexes
                into mail_body_lines) plus the raw header values found for it
                under 'from' / 'to' / 'time' / 'cc' / 'subj'. The last element
                is always the fragment that was still open when the scan
                reached the first line.
            flag: one of the EXTRACT_FLAG_* constants
        """
        # preprocess mail_body
        mail_body = mail_body.replace('\r\n', '\n')
        mail_body = re.sub('[\n]{2,}', '\n', mail_body)
        # extract reply message
        mail_body_lines = mail_body.split('\n')
        last_idx = len(mail_body_lines) - 1
        flag = MailReplyParser.EXTRACT_FLAG_NORMAL
        reply_status = MailReplyParser.REPLY_STATUS_BODY
        reply_header_past_indics = None  # index of the header line seen most recently
        body_fragments = []
        body_fragment = {'body': {'ed': last_idx, 'st': last_idx}}
        found_header = False
        for idx in range(last_idx, -1, -1):  # traverse from end to begin
            line = mail_body_lines[idx]
            header_pair = MailReplyParser.HEADER_REGEX.findall(line)
            if len(header_pair) == 1:  # header part
                found_header = True
                header_nm, head_val = header_pair[0]
                if reply_status == MailReplyParser.REPLY_STATUS_BODY:  # header part start
                    if header_nm not in MailReplyParser.HEADER_START_KEY_SET:
                        # a header-looking line outside a header block is ignored
                        continue
                    reply_status = MailReplyParser.REPLY_STATUS_HEADER
                    reply_header_past_indics = idx
                elif reply_status == MailReplyParser.REPLY_STATUS_HEADER:
                    # non-header lines between this header and the one below it
                    # are continuation lines of this header's value
                    head_val += ''.join(mail_body_lines[idx + 1:reply_header_past_indics])
                    reply_header_past_indics = idx
                    if header_nm in MailReplyParser.HEADER_END_KEY_SET:
                        reply_status = MailReplyParser.REPLY_STATUS_HEADER_END
                body_fragment[MailReplyParser.HEADER_MAP[header_nm]] = head_val
            else:  # text part
                if reply_status == MailReplyParser.REPLY_STATUS_HEADER:
                    # continuation line inside a header block, merged above
                    continue
                elif reply_status == MailReplyParser.REPLY_STATUS_HEADER_END:  # header part end
                    # close the fragment whose header block has just been left
                    if body_fragment['body']['st'] <= body_fragment['body']['ed'] \
                            and all(key in body_fragment for key in ('subj', 'body', 'from')):
                        body_fragments.append(body_fragment)
                    else:
                        MailGlobal.logger.warn("reply msg extraction, lack info: %s" % str(body_fragment.keys()))
                        flag = MailReplyParser.EXTRACT_FLAG_LACK_INFO
                    body_fragment = {'body': {'ed': idx, 'st': idx}}
                    reply_status = MailReplyParser.REPLY_STATUS_BODY
                if MailReplyParser.HR_LINE_REGEX.match(line):  # split line
                    # the separator itself is not part of the text above it
                    body_fragment['body']['ed'] = idx - 1
                else:
                    body_fragment['body']['st'] = idx
        if not found_header:  # not found any mail header
            flag = MailReplyParser.EXTRACT_FLAG_NO_HEADER
        body_fragments.append(body_fragment)  # append the fragment that is still open
        return mail_body_lines, body_fragments, flag

    @staticmethod
    def extract_addr(addr, alias_map, log_queue=None, desc=""):
        """
        Split a raw address header value and parse every address in it.

        :param addr: raw header value, possibly holding several addresses
        :param alias_map: display name -> address; used to resolve units that
            parse_address() cannot parse, and extended with newly seen names
        :param log_queue: optional list of log messages, forwarded to
            parse_address(); may be None
        :param desc: label forwarded to parse_address()
        :return: list of address dicts, each with 'flag' defaulted to 0
        """
        addr = MailReplyParser.ADDR_RM_REG.sub('', addr)
        # find email delim: the most frequent candidate wins, the earlier one
        # on a tie; ';' is used when no candidate occurs at all
        delim = ';'
        max_delim_cnt = 0
        for dlch in (u';', u",", u'，'):
            cnt = addr.count(dlch)
            if cnt > max_delim_cnt:
                delim = dlch
                max_delim_cnt = cnt
        # extract address
        addr_list = []
        for addr_unit in addr.split(delim):
            addr_unit = addr_unit.strip()
            addr_parsed = parse_address(addr_unit, desc=desc, log_queue=log_queue)
            if addr_parsed != "":
                addr_list.append(addr_parsed)
                if 'n' in addr_parsed and addr_parsed['n'] not in alias_map:
                    alias_map[addr_parsed['n']] = addr_parsed['a']
            elif addr_unit in alias_map:
                # the unit is a bare display name that is already known
                addr_list.append({'a': alias_map[addr_unit], 'n': addr_unit})
                # presumably drops the failure message parse_address() queued
                # for this unit - TODO confirm. Guarded so that a missing or
                # empty queue no longer raises.
                if log_queue:
                    log_queue.pop()
        for entry in addr_list:
            entry.setdefault('flag', 0)
        return addr_list

    @staticmethod
    def trim(raw_text):
        """Strip leading and trailing spaces, carriage returns and newlines."""
        return raw_text.strip(' \r\n')

    @staticmethod
    def alias_collect(x, alias_map):
        """Record x['n'] -> x['a'] in alias_map when x is a dict that has a name."""
        if isinstance(x, dict) and 'n' in x:
            alias_map[x['n']] = x['a']

    @staticmethod
    def add_edge(net, addr1, addr2, itype, enhance=0.5):
        """
        Count one interaction from addr1 to addr2 in net.

        :param net: dict (addr1, addr2) -> [to_count, cc_count], updated in place
        :param itype: FROM_TO or FROM_CC, selects the counter to increase
        :param enhance: unused; kept so existing callers keep working
        """
        edge = net.setdefault((addr1, addr2), [0, 0])
        edge[itype] += 1

    @staticmethod
    def _build_alias_map(mail):
        """Collect display name -> address from the mail's from / to / cc fields."""
        alias_map = dict()
        MailReplyParser.alias_collect(mail['from'], alias_map)
        for rcpt in mail['to']:
            MailReplyParser.alias_collect(rcpt, alias_map)
        for rcpt in mail['cc']:
            MailReplyParser.alias_collect(rcpt, alias_map)
        return alias_map

    @staticmethod
    def parse_fragment_addr(body_fragments, mail, log_queue=None):
        """
        Parse the address headers of every fragment in place and drop the
        fragments whose addresses are unusable.

        A fragment is kept only when it has exactly one sender and at least
        one 'to' or 'cc' recipient after parsing.

        :param body_fragments: fragments from extract_reply_fragment()
        :param mail: enclosing mail, source of the initial alias map
        :param log_queue: optional list of log messages; one "" separator is
            appended per fragment. May be None.
        :return: list of the kept fragments, with 'from' replaced by an address
            dict and 'to' / 'cc' replaced by lists of address dicts
        """
        if log_queue is None:
            log_queue = []
        # get alias map
        alias_map = MailReplyParser._build_alias_map(mail)
        # extract address
        new_body_fragments = []
        for frag in body_fragments:
            log_queue.append("")
            if 'from' not in frag or ('to' not in frag and 'cc' not in frag):
                continue
            addr_from = MailReplyParser.extract_addr(frag['from'], alias_map, desc="from", log_queue=log_queue)
            if len(addr_from) != 1:
                continue
            frag['from'] = addr_from[0]
            for key in ('to', 'cc'):
                if key in frag:
                    frag[key] = MailReplyParser.extract_addr(frag[key], alias_map, desc=key, log_queue=log_queue)
                else:
                    frag[key] = []
            if len(frag['to']) == 0 and len(frag['cc']) == 0:
                continue
            new_body_fragments.append(frag)
        return new_body_fragments

    @staticmethod
    def extract_interactive(body_fragments, mail, log_queue=None):
        """
        Build the sender -> recipient interaction counts of all fragments.

        :param body_fragments: fragments from extract_reply_fragment()
        :param mail: enclosing mail, source of the initial alias map
        :param log_queue: optional list of log messages; one "" separator is
            appended per fragment. May be None.
        :return: dict (from_addr, to_addr) -> [to_count, cc_count]
        """
        if log_queue is None:
            log_queue = []
        # get alias map
        alias_map = MailReplyParser._build_alias_map(mail)
        # extract address
        inter_net = {}
        for frag in body_fragments:
            log_queue.append("")
            if 'from' not in frag or ('to' not in frag and 'cc' not in frag):
                continue
            addr_from = MailReplyParser.extract_addr(frag['from'], alias_map, desc="from", log_queue=log_queue)
            if len(addr_from) != 1:
                continue
            addr_from = addr_from[0]['a']
            for key, itype in (('to', MailReplyParser.FROM_TO), ('cc', MailReplyParser.FROM_CC)):
                if key not in frag:
                    continue
                for rcpt in MailReplyParser.extract_addr(frag[key], alias_map, desc=key, log_queue=log_queue):
                    MailReplyParser.add_edge(inter_net, addr_from, rcpt['a'], itype)
        return inter_net

    @staticmethod
    def parse_reply(mail, msg_type='all', log_queue=None):
        """
        Extract the text and the interaction counts of a reply mail.

        :param mail: mail with 'subj', 'body', 'from', 'to' and 'cc'
        :param msg_type: [all / last]; 'all' joins the text of every fragment
            from the top of the body downwards, anything else returns only the
            text of the top-most fragment
        :param log_queue: optional list of log messages; may be None
        :return: (text, interaction dict), or (None, None) when the mail is not
            a reply
        """
        if not MailReplyParser.is_reply(mail):  # if not a reply message
            return None, None
        mail_body_lines, body_fragments, _ = MailReplyParser.extract_reply_fragment(mail['body'])

        def fragment_text(frag):
            span = frag['body']
            return '\n'.join(mail_body_lines[span['st']: (span['ed'] + 1)])

        if msg_type == 'all':
            # fragments are stored bottom-up, so reverse to restore reading order
            inner_msg = ''.join(fragment_text(frag) + '\n' for frag in body_fragments[::-1])
        else:
            # the last stored fragment is the top-most text of the body
            inner_msg = fragment_text(body_fragments[-1])
        # extract interactive part
        inter_net = MailReplyParser.extract_interactive(body_fragments, mail, log_queue=log_queue)
        return inner_msg, inter_net

    @staticmethod
    def category(mail):
        """
        Classify a mail as reply or forward by its subject prefix.

        :param mail: mail with a 'subject' key
        :return: RELAY_MAIL for a forward prefix, REPLY_MAIL otherwise
            (including subjects without any known prefix)
        """
        kw = MailReplyParser.SUBJECT_REGEX.findall(mail['subject'])
        if kw and MailReplyParser.FORWARD_REGEX.match(kw[0]):
            return MailReplyParser.RELAY_MAIL
        return MailReplyParser.REPLY_MAIL

    @staticmethod
    def is_reply(mail):
        """
        Check whether a mail is a reply / forward.

        :param mail: mail with 'subj' and 'body' keys
        :return: True when the subject carries a reply / forward prefix or the
            body holds at least 4 different header labels, otherwise False
        """
        # check subject
        if MailReplyParser.SUBJECT_REGEX.match(mail['subj']):
            return True
        # check body; note that distinct *labels* are counted, so two labels of
        # the same field (e.g. 'Date' and 'Sent') count twice
        header_match = MailReplyParser.Multi_HEADER_REGEX.findall(mail['body'])
        return len(set(header_match)) >= 4  # 4 means (from, to, time, subject)


def batch_reply_parser(mail_datas, n_jobs=MailGlobal.etc_map['n_jobs']):
    """
    Run single_reply_parse() over every mail and merge the per-mail results.

    :param mail_datas: iterable of mails, handed one by one to single_reply_parse()
    :param n_jobs: worker count passed to Parallel
    :return: (mail_reply_datas, mail_links) - the flat list of all mails found
        in the merged per-mail maps, and the concatenated link lists
    """
    merged_map = dict()
    merged_links = []
    totals = np.array([0] * 6)  # summed 'count' vectors of the single results
    extract_info = {"has_time": 0.0}
    verbose = 1 if MailGlobal.DEBUG_MODEL else 0

    jobs = (delayed(single_reply_parse)(index, mail) for index, mail in enumerate(mail_datas))
    result = Parallel(n_jobs=n_jobs, verbose=verbose)(jobs)

    for _, item in result:
        if 'map' in item:
            for digest, mails in item['map'].items():
                merged_map.setdefault(digest, []).extend(mails)
        if 'link' in item:
            merged_links.extend(item['link'])
        if 'count' in item:
            totals += item['count']
        # NOTE(review): the result dicts built by single_reply_parse() only
        # carry 'map' / 'link' / 'count', so this looks like it never fires.
        if "proto" not in item and "time" in item:
            extract_info['has_time'] += 1

    logger = MailGlobal.logger
    logger.info("extract reply msg mail: %d,  including reply number: %d" % tuple(totals[0:2]))
    logger.info("unable to extract reply mail: %d, including lacking info mail=%d, "
                "no header mail=%d, and (messy code) mail=%d" % tuple(totals[2:6]))
    # NOTE(review): "freq" is filled with totals[0], not with has_time - confirm intent.
    logger.info("extract reply msg mail: time field freq=%d, ratio=%.6f"
                % (totals[0], extract_info['has_time'] / (totals[0] + 0.1)))

    if MailGlobal.DEBUG_MODEL:
        mail_map_count = sum(len(mails) for mails in merged_map.values())
        logger.debug("mail map number: %d" % mail_map_count)
        logger.debug("mail link number: %d" % len(merged_links))

    # flatten the digest -> mails map into one list
    mail_reply_datas = [mail for mails in merged_map.values() for mail in mails]
    return mail_reply_datas, merged_links


def single_reply_parse(index, json_mail, time_it=10):
    """
    Parse one mail: split off its quoted replies, normalise every resulting
    mail and fingerprint it.

    :param index: position of the mail in the batch; returned unchanged
    :param json_mail: raw mail, handed to the Mail constructor
    :param time_it: limit passed to time_limit() for the whole parse
    :return: (index, result) where result always has 'map' (md5_ftsp digest ->
        list of mails) and, depending on the path taken, 'count' (6-element
        counter vector) and 'link' (list of {'node1', 'node2', 'type'} dicts)
    """
    def write_log(_mail, _queue):
        # Emit the queued messages through the logger's raw writer, but only if
        # at least one of them is non-empty.
        has_msg = False
        for msg in _queue:
            if len(msg) > 0:
                has_msg = True
                break
        if has_msg:
            MailGlobal.logger._writer("#"*20 + _mail["_id"] + " BEGIN " + "#"*20)
            step = 0
            # messages before the first "" separator are labelled "original"
            if _queue[0] != "":  # original error
                MailGlobal.logger._writer("# original")
            while step < len(_queue) and _queue[step] != "":  # log original error
                MailGlobal.logger._writer(_queue[step])
                step += 1
            # every further run of messages between "" separators is one "floor"
            floor_no = 0
            floor_zone = False
            while step < len(_queue):
                msg = _queue[step]
                if msg == "":
                    floor_zone = False
                else:
                    if not floor_zone:
                        floor_no += 1
                        floor_zone = True
                        MailGlobal.logger._writer("# floor %d" % floor_no)
                    MailGlobal.logger._writer(msg)
                step += 1
            MailGlobal.logger._writer("#"*20 + " END " + "#"*20)

    def parse_mail_time(mail):
        # Normalise mail['time'] in place to "YYYY-mm-dd HH:MM:SS" (local time)
        # when it is a float or can be parsed; otherwise keep the stripped text.
        if not 'time' in mail:
            mail["time"] = ""
        if type(mail['time']) is not float:
            mtime = advanced_parse_time(mail['time'])
            if mtime is not MailGlobal.INVALID_TIME:
                mail['time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(mtime))
            else:
                mail['time'] = mail['time'].strip()
        else:
            mail['time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(mail['time']))

    def parse_messy_text(mail):
        # Strip subject and body, then rename the keys in place:
        # 'subj' -> 'subject', 'body' -> 'plain', 'attach' -> 'attachment'.
        # After this call the mail no longer has 'subj' / 'body'.
        mail['subj'] = mail['subj'].strip(" \r\n")
        mail['body'] = mail['body'].strip(" \r\n")
        mail['subject'] = mail.pop('subj')
        mail['plain'] = mail.pop('body')
        if 'attach' in mail:
            mail['attachment'] = mail.pop('attach')

    def add_mail_map(mail, mail_type="original"):
        # Normalise the mail, attach its digests and register it in the
        # enclosing mail_map under its md5_ftsp digest.
        # parse time
        parse_mail_time(mail)
        # strip the text fields and rename them to their final keys
        parse_messy_text(mail)
        if "_id" not in mail:
            mail["_id"] = hashlib.md5(json.dumps(mail)).hexdigest()
        # md5_sp: digest of subject + plain text with spaces, CR and LF removed
        raw_text = MailReplyParser.REG_FILTER_CHAR.sub('', "%s %s" % (mail['subject'], mail['plain'])).encode('utf8')
        mail["md5_sp"] = hashlib.md5(raw_text).hexdigest()
        # trim mail time: drop the last 3 characters when the time matches
        # MailGlobal.time_standard (presumably the ":SS" seconds part - TODO confirm)
        if MailGlobal.time_standard.match(mail["time"]):
            raw_time = mail["time"][: -3]
        else:
            raw_time = mail["time"]
        # md5_ftsp: digest of sender address + trimmed time + text
        mail["md5_ftsp"] = hashlib.md5("%s %s %s" % (mail["from"]['a'].encode("utf8"), raw_time, raw_text)).hexdigest()
        if "mid" not in mail:
            mail["mid"] = mail["md5_ftsp"]
        key = mail["md5_ftsp"]
        if key not in mail_map:
            mail_map[key] = [mail]
        else:
            mail_map[key].append(mail)
        mail['type'] = mail_type

    # digest -> mails; filled by add_mail_map() through its closure
    mail_map = dict()
    try:
        with time_limit(time_it):
            log_queue = []
            mail = Mail(json_mail, log_queue=log_queue, extra=True)
            # parse mail body
            mail["body"] = parse_mail_body(mail["body"])
            mail_links = []
            reply_parser = MailGlobal.reply_parser
            # reply count (6 elements):
            #   [0] mails with extracted replies   [1] extracted reply fragments
            #   [2] reply mails without any extractable fragment
            #   [3] ... of those, flagged lack-info [4] ... flagged no-header
            #   [5] ... whose text contains U+FFFD (replacement character)
            reply_count = np.array([0] * 6)
            # MailGlobal.logger.debug("subject: %s" % mail['subj'])
            # get mail md5 txt
            mail['subj'] = mail['subj'].strip()
            mail_body = re.sub('[\n]{2,}', '\n', mail['body'])
            mail['body'] = MailReplyParser.trim(mail_body)
            # extract mail reply
            if reply_parser.is_reply(mail):  # reply mail
                mail_body_lines, body_fragments, flag = MailReplyParser.extract_reply_fragment(mail['body'])
                # mail['flag'] = flag
                # handle fragment
                if len(body_fragments) == 1:  # not extract any fragment
                    add_mail_map(mail.m)
                    reply_count[2] += 1
                    MailGlobal.logger.debug("unable to extract reply msg")
                    if flag == MailReplyParser.EXTRACT_FLAG_LACK_INFO:
                        reply_count[3] += 1
                    elif flag == MailReplyParser.EXTRACT_FLAG_NO_HEADER:
                        reply_count[4] += 1
                    # 'plain' exists because add_mail_map() renamed 'body' above
                    if mail['plain'].find(u'\ufffd') >= 0:
                        reply_count[5] += 1
                    return index, {'map': mail_map, 'count': reply_count}
                # parse address and filter some fragment
                parsed_body_fragments = MailReplyParser.parse_fragment_addr(body_fragments, mail, log_queue=log_queue)
                # append original: the last raw fragment is the mail's own text
                parsed_body_fragments.append(body_fragments[-1])
                body_fragments = parsed_body_fragments
                # modify last body fragment: the mail itself takes the place of
                # the last fragment and temporarily carries the {'st', 'ed'}
                # line span as its body
                mail['body'] = body_fragments[-1]['body']
                body_fragments[-1] = mail.m

                MailGlobal.logger.debug("extract reply msg number: %d" % (len(body_fragments) - 1))
                reply_count[0] += 1
                reply_count[1] += len(body_fragments) - 1
                for idx, frag in enumerate(body_fragments):
                    frag['subj'] = frag['subj'].strip()
                    # replace the line span by the actual text
                    frag['body'] = '\n'.join(mail_body_lines[frag['body']['st']: (frag['body']['ed'] + 1)])
                    frag['body'] = MailReplyParser.trim(frag['body'])
                    # every fragment but the last one was extracted from the body
                    mail_type = "original"
                    if idx < (len(body_fragments) - 1):
                        mail_type = "extracted"
                    add_mail_map(frag, mail_type=mail_type)
                    if idx >= 1:
                        # link each mail to the fragment directly below it
                        pre_frag = body_fragments[idx - 1]
                        link_type = MailReplyParser.category(frag)
                        mail_links.append({'node1': frag['mid'], 'node2': pre_frag['mid'], 'type': link_type})
                # write to log
                write_log(mail, log_queue)
                return index, {'map': mail_map, 'link': mail_links, 'count': reply_count}
            else:  # not reply mail
                MailGlobal.logger.debug("mail is not reply mail")
                add_mail_map(mail.m, mail_type="original")
                # write to log
                write_log(mail, log_queue)
                return index, {'map': mail_map}
    except TimeoutException:
        # NOTE(review): `mail` is unbound here if the timeout fires while the
        # Mail object is still being constructed, and mail.m may already have
        # lost 'subj' / 'body' (renamed by add_mail_map) or hold a line-span
        # dict as body if the timeout fires later - confirm this handler is
        # safe for those cases.
        MailGlobal.logger.warn("reply parser exec time out, index=%s, text=%s" % (index, mail.m['body'].encode('utf8')))
        add_mail_map(mail.m)
    return index, {'map': mail_map}


def test():
    mail_body = u"数据已修改确认过，详见附件~\r\n\r\n\r\n\r\n成为信息安全领域最信赖的、技术领先的、\r\n国际知名的方案、产品和服务提供商\r\n------------------------------------------------------\r\n人力资源部 冯萍\r\n网神信息技术（北京）股份有限公司 \r\n地址：北京市海淀区上地开拓路7号先锋大厦二段一层（100085)\r\n手机：18618105615\r\n固话：010-62972892\r\n传真：010-62972896（请注明收件人）\r\n客户热线：400-610-8220\r\n邮箱：fengping@legendsec.com\r\n网址：www.legendsec.com\r\n \r\n发件人： 王冠华\r\n发送时间： 2016-03-04 11:55\r\n收件人： fengping@legendsec.com; guohx@legendsec.com\r\n抄送： guomc (guomc@legendsec.com); 郭建华 (guojh@legendsec.com)\r\n主题： HC预算表格更新\r\n \r\n冯萍,宏霞好，\r\n \r\n请把安服与安管中心，供应链，品牌推广，合作发展部，财务和人力行政这几个部门的情况填入附件表格\r\n \r\n不懂的地方可以问问建华怎么填，另你那边涉及到多种用工形式，把各用工形式拆开\r\n \r\n另建华，慕初，宏霞和冯萍你们三个做完了，统一都给慕初一版，慕初统一看一下我们不要拉下网神一个人\r\n \r\n今天下班前需要反馈我\r\n \r\n有任何问题可以随时联络我\r\n \r\n说的不全的慕初和建华直接补充\r\n \r\n \r\n王冠华\r\n人力资源部\r\n \r\n奇虎360\r\n手机：15727302652\r\n电话：010-52448453\r\n邮件：wangguanhua@360.cn\r\n地址：北京市朝阳区酒仙桥路6号院（电子城•国际电子总部）2号楼B座5层\r\n \r\n"

    log_queue = []
    mail = {'from': {'a': "wangguanhua@360.cn"},
            "to": [{'n': u'王冠华', 'a': "wangguanhua@360.cn"}], "cc": []}
    mail_body_lines, body_fragments, flag = MailReplyParser.extract_reply_fragment(mail_body)
    body_fragments = MailReplyParser.parse_fragment_addr(body_fragments, mail, log_queue=log_queue)

    print '\n'.join(log_queue)
    print '\n'*3
    print "flag = ", flag
    for frag in body_fragments:
        print frag

    # parse address
    from mail_preprocess import advanced_parse_time
    s = u"2015年12月4日 15:13"
    print advanced_parse_time(s)


if __name__ == "__main__":
    # test()
    # json_mails = json.load(open('reply_test.txt', 'r'))
    json_mails = map(lambda line: json.loads(line.strip()), open('reply_test_2.txt', 'r').readlines())
    MailGlobal.register_instance('reply_parser', MailGlobal.timeit_v2(lambda: MailReplyParser(),
                                                                      desc="load reply parser "))
    if len(json_mails) > 0:
        # mail_datas = load_json_mails(json_mails)
        """
            mail_reply_datas (type of list)
            mail_reply_links (type of list)
        """
        mail_reply_datas, mail_reply_links = batch_reply_parser(json_mails, n_jobs=1)
        print '\n'.join(map(str, mail_reply_datas[:10]))
        print "\n\n"
        print '\n'.join(map(str, mail_reply_links[:10]))