/mail_signature.py
Python | 324 lines | 282 code | 20 blank | 22 comment | 1 complexity | 9c99b4f2cbe5adcdc5e1020eeeabcbba MD5 | raw file
- #!/usr/bin/python
- # -*- coding: utf-8 -*-
- # author: yaojian-xy <yaojian-xy@360.cn>
- import re
- from sklearn.externals.joblib import Parallel, delayed
- from mail_global import MailGlobal, TimeoutException, time_limit
class MailSignatureParser:
    """
    Locate and strip the signature zone at the end of a mail body.

    A signature zone is recognised by the contact details it carries: person
    names, address, phone, zipcode, e-mail, website and so on.  Every method is
    static; the class only acts as a namespace for the patterns below.

    NOTE(review): this class was reconstructed from a listing that had lost its
    indentation.  Where nesting was ambiguous the reading that keeps every
    variable live was chosen; those spots carry their own NOTE(review).
    """

    # Frequent Chinese surnames, used by REG_PERSON to spot a bare person name.
    CHINESE_SURNAME = (
        u"李王张刘陈杨赵黄周吴徐孙胡朱高林何郭马罗梁宋郑谢韩唐冯于董萧程曹袁邓许傅沈曾彭吕"
        u"苏卢蒋蔡贾丁魏薛叶阎余潘杜戴夏钟汪田任姜范方石姚谭廖邹熊金陆郝孔白崔康毛邱秦江史"
        u"顾侯邵孟龙万段雷钱汤尹黎易常武乔贺赖龚文"
    )

    # Names of the entity kinds a signature line can contribute.
    KEY_EMAIL = 'email'
    KEY_PHONE = 'phone'
    KEY_ADDRESS = 'address'
    KEY_ZIP_CODE = 'zipcode'
    KEY_COMPANY = 'company'
    KEY_WEBSITE = 'website'
    KEY_PERSON = 'person'
    KEY_OTHER = 'other'

    # Entity kind -> alternation of the labels that introduce it ("Tel:", "E-mail:" ...).
    ENTITY_MAP = {
        KEY_ADDRESS: u"总\s*部|地\s*址|addr?",
        KEY_PHONE: u"电\s*话|手\s*机|手机号码|Tel|座机|Mobile",
        KEY_EMAIL: u"E-?mail",
        KEY_ZIP_CODE: u"邮\s*编|PC\.",
        KEY_PERSON: u"联系人|Contact",
        KEY_WEBSITE: u"网\s*站|website",
        KEY_OTHER: u"Q\s*Q|飞\s*信",
    }
    # Compile each alternation into "<optional quote markers><label><colon>" anchored
    # at the start of the line.  A comprehension keeps the loop variables out of the
    # class namespace.
    # NOTE(review): `[::]` (here and in the patterns below) repeats the ASCII colon;
    # presumably one of the two was a full-width colon before the text was
    # transcribed -- confirm against the upstream file.
    ENTITY_MAP = {
        _key: re.compile(u'^[ >\*]*(%s)[::]' % _labels, re.IGNORECASE)
        for _key, _labels in ENTITY_MAP.items()
    }

    REG_EMAIL = MailGlobal.email_reg_2
    # ex: http://www.hylandslaw.com or www.hylandslaw.com
    REG_URL = re.compile(u"(?:http[s]?://|www\.){1}(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
    # ex: ["000-000-0000", "000 000 0000", "+86 10 6588 8825", "+86-21-60410082"]
    REG_PHONE = re.compile(
        u"(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4}|\d{2}[-\s]?\d{2}[-\s]?\d{4}[-\s]?\d{4})")
    REG_COMPANY = re.compile(u"[:: ]*([\u4e00-\u9fff|\u0020-\u00ff]{2,}公司)")
    REG_ADDRESS = re.compile(u"(?:.+省)?(?:.+市)?.+区(?:.+路)?(?:.+号)?")
    REG_ZIP_CODE = re.compile(u"(?:邮\s*编|PC\.)\s*[::]?\s*\d{6}")
    REG_PERSON = re.compile(u"(^|[\s::]+)[%s]([\u4e00-\u9fa5]{1,2}|\s{1,2}[\u4e00-\u9fa5])($|\s+)" % CHINESE_SURNAME)

    # Entity kind -> pattern that recognises the *value* itself, used for lines
    # that carry no label.
    REGEX_MAP = {
        KEY_EMAIL: REG_EMAIL,
        KEY_WEBSITE: REG_URL,
        KEY_PHONE: REG_PHONE,
        KEY_COMPANY: REG_COMPANY,
        KEY_ZIP_CODE: REG_ZIP_CODE,
        KEY_ADDRESS: REG_ADDRESS,
        KEY_PERSON: REG_PERSON,
    }

    # A line made only of 4+ dashes / underscores: the classic signature separator.
    SEPARATE_LINE_REGEX = re.compile(u"^\s*(?:-|_|—){4,}\s*$", re.MULTILINE)
    # Two or more consecutive (possibly space-padded) line breaks.
    SERIES_EMPTY_LINE_REGEX = re.compile(u"(?:\n *){2,}")
    # Characters removed from the body before it is split into lines.
    # NOTE(review): the first alternative is a plain space as transcribed; confirm it
    # was not meant to be a non-breaking space / "&nbsp;".
    REPLACE_REGEX = re.compile(' |\"')
    # Header line of a quoted reply ("From: ...").
    REPLY_REGEX = re.compile(u'^\s*[>\*]{,3}\s*\*?(发件人|寄件者|From)[::]{1}.+', re.MULTILINE)
    # "---- Original Message ----" style divider.
    REPLY_SEPARATE_REGEX = re.compile(u"^\s*(?:-|—){4,}\s*(?:原始邮件|Original Message)\s*(?:-|—){4,}\s*$",
                                      re.MULTILINE | re.IGNORECASE)

    # Second element of the tuples returned by find_sign_zone when a signature was found.
    IS_SIGNATURE_MAIL = True

    def __init__(self):
        pass

    @staticmethod
    def preprocess(mail_body):
        """
        Normalise a mail body and split it into lines.

        Removes the characters matched by REPLACE_REGEX, converts CRLF to LF and
        collapses every run of blank lines into a single blank line.

        :param mail_body: mail body text
        :return: mail body lines
        """
        # Reuse the compiled class pattern instead of re-spelling it inline.
        body_stream = MailSignatureParser.REPLACE_REGEX.sub('', mail_body)
        body_stream = body_stream.replace('\r\n', '\n')
        body_stream = MailSignatureParser.SERIES_EMPTY_LINE_REGEX.sub('\n\n', body_stream)
        return body_stream.split('\n')

    @staticmethod
    def remove_reply_pos(mail_body):
        """
        Cut the quoted reply off the end of a mail body.

        :param mail_body: mail body text
        :return: the text before the first "original message" divider if there is
                 one, otherwise before the first reply header line, otherwise the
                 body unchanged
        """
        for marker in (MailSignatureParser.REPLY_SEPARATE_REGEX, MailSignatureParser.REPLY_REGEX):
            found = marker.search(mail_body)
            if found is not None:
                return mail_body[:found.start()]
        return mail_body

    @staticmethod
    def _match_entity_label(line):
        """Return the ENTITY_MAP key whose label opens `line`, or None when no label does."""
        for key, reg in MailSignatureParser.ENTITY_MAP.items():
            fits = reg.findall(line)
            if fits and fits[0]:
                return key
        return None

    @staticmethod
    def _iter_generic_matches(line):
        """Lazily yield every REGEX_MAP key whose value pattern occurs in `line`, logging each hit."""
        for key, reg in MailSignatureParser.REGEX_MAP.items():
            found = reg.search(line)  # searched once; the match object gives the text to log
            if found is not None:
                MailGlobal.logger.debug("found signature info, key=%s, value=%s" %
                                        (key, found.group(0).encode('utf8')))
                yield key

    @staticmethod
    def check_sign_zone(mail_body_lines, max_matches=3, max_lines=12):
        """
        Verify whether a special zone is a signature zone.

        Each of the first `max_lines` lines contributes at most one entity kind:
        the labelled kind if the line opens with a known label, otherwise the
        first value pattern that matches.  Lines containing a sentence mark or a
        comma are taken to be prose and skipped.

        :param mail_body_lines: lines of the candidate zone, top to bottom
        :param max_matches: number of distinct entity kinds required
        :param max_lines: number of leading lines inspected
        :return: True as soon as `max_matches` distinct kinds were seen, else False
        """
        parser = MailSignatureParser
        matched_keys = set()
        for raw_line in mail_body_lines[:max(max_lines, 0)]:
            line = raw_line.strip()
            # `in` rather than `find(...) > 0`, so a mark at index 0 counts as well.
            if u"。" in line or u',' in line:
                continue
            # NOTE(review): the flattened original carried a flag/break after the
            # value-pattern loop whose nesting could not be recovered; this keeps
            # the evident intent (classify the line once, then move on).
            key = parser._match_entity_label(line)
            if key is None:
                key = next(parser._iter_generic_matches(line), None)
            if key is not None:
                matched_keys.add(key)
            if len(matched_keys) >= max_matches:
                return True
        return False

    @staticmethod
    def find_sign_zone(mail_body, max_lines=12, max_matches=3, max_empty_line=2):
        """
        Split a mail body into content and signature and return the content.

        Quoted replies are removed first.  If a separator line is present and the
        text below it passes check_sign_zone, everything from the separator on is
        dropped.  Otherwise the last `max_lines` preprocessed lines are scanned
        bottom-up for contact details.

        :param mail_body: mail body text
        :param max_lines: size of the trailing window that is scanned
        :param max_matches: number of distinct entity kinds required
        :param max_empty_line: consecutive empty lines that close a started zone
        :return: tuple (body, is_signature).  When a signature was found through
                 the line scan the body is rebuilt from the preprocessed lines;
                 when none was found it is the reply-trimmed, right-stripped input.
        """
        parser = MailSignatureParser
        # check reply mail
        mail_body = parser.remove_reply_pos(mail_body).rstrip("\n\r ")

        # Without a separator line or a blank-line gap there is nothing to anchor on.
        sep_match = parser.SEPARATE_LINE_REGEX.search(mail_body)
        if sep_match is None and parser.SERIES_EMPTY_LINE_REGEX.search(mail_body) is None:
            return mail_body, not parser.IS_SIGNATURE_MAIL

        if sep_match is not None:
            sep_pos = sep_match.start()
            if parser.check_sign_zone(mail_body[sep_pos:].split("\n")):  # verify signature zone
                return mail_body[:sep_pos], parser.IS_SIGNATURE_MAIL

        # preprocess, split by lines
        mail_body_lines = parser.preprocess(mail_body)
        total = len(mail_body_lines)
        window_start = total - min(max_lines, total)
        match_list = set()
        last_match_pos = total      # index of the topmost line that belongs to the signature
        flag_sign_zone = False      # becomes True once the first detail was seen
        accum_empty_lines = 0
        for idx in range(total - 1, window_start - 1, -1):
            line = mail_body_lines[idx].strip()
            # A sentence mark or several commas mean prose: skip the line.
            if u"。" in line or line.count(u',') > 1:
                continue
            if line == "":
                if flag_sign_zone:  # only count gaps after the zone has started
                    accum_empty_lines += 1
                if accum_empty_lines >= max_empty_line:
                    # enough consecutive empty lines: the zone ends here
                    last_match_pos = idx
                    break
                continue
            accum_empty_lines = 0

            # A labelled line yields exactly its label's kind; an unlabelled one
            # yields every value pattern that matches.
            label_key = parser._match_entity_label(line)
            if label_key is not None:
                found_keys = [label_key]
            else:
                found_keys = list(parser._iter_generic_matches(line))
            if found_keys:
                match_list.update(found_keys)
                last_match_pos = idx
                flag_sign_zone = True

        # NOTE(review): evaluated after the scan, the only placement under which
        # `last_match_pos` (and its assignment before `break`) is actually used.
        if len(match_list) >= max_matches:
            return u'\n'.join(mail_body_lines[:last_match_pos]), parser.IS_SIGNATURE_MAIL
        # not found signature
        return mail_body, not parser.IS_SIGNATURE_MAIL
- # def parallel_handle_signature(mail_sets, n_jobs=MailGlobal.etc_map['n_jobs']):
- # result = Parallel(n_jobs=n_jobs, verbose=11)(
- # delayed(single_handle_signature)(index, mail['body']) for index, mail in mail_sets.df.iterrows())
- # for item in result:
- # index, mail_body = item
- # mail_sets.df.loc[index, 'body'] = mail_body
- # return mail_sets
def parallel_handle_signature(mails, n_jobs=MailGlobal.etc_map['n_jobs']):
    """
    Strip the signature from every mail body using a joblib worker pool.

    :param mails: indexable sequence of mails; each item is read and written
                  through its 'body' key
    :param n_jobs: number of joblib workers; the default is read from
                   MailGlobal.etc_map['n_jobs'] when the function is defined
    :return: the same `mails` object, its bodies replaced in place
    """
    runner = Parallel(n_jobs=n_jobs, verbose=1 if MailGlobal.DEBUG_MODEL else 0)
    tasks = (delayed(single_handle_signature)(pos, item['body']) for pos, item in enumerate(mails))
    # Every worker echoes its index back, so results can be written without
    # relying on their order.
    for pos, stripped_body in runner(tasks):
        mails[pos]['body'] = stripped_body
    return mails
def single_handle_signature(index, mail_body, time_it=10):
    """
    Strip the signature from one mail body, giving up after `time_it`.

    :param index: position of the mail in its batch, returned untouched
    :param mail_body: mail body text
    :param time_it: limit handed to time_limit()
    :return: tuple (index, body); the body is returned as received when the
             limit is hit before the parser finished
    """
    result = mail_body
    try:
        with time_limit(time_it):
            result = MailSignatureParser.find_sign_zone(mail_body)[0]
    except TimeoutException:
        MailGlobal.logger.warn("signature exec time out, index=%s, text=%s" % (index, result.encode('utf8')))
    return index, result
def metric_signature(mail_file):
    """
    Log the share of mails in `mail_file` in which a signature zone is detected.

    :param mail_file: path handed to mail_io.load_mail_file (raw_type="dict")
    :return: None; the ratio is only written to MailGlobal.logger
    """
    from mail_io import load_mail_file
    mail_datas = load_mail_file(mail_file, raw_type="dict")
    total = len(mail_datas)
    if total == 0:
        # Nothing to average over: report it instead of dividing by zero.
        MailGlobal.logger.warn("signature mail metric skipped, no mails loaded from %s" % mail_file)
        return
    # The ratio does not depend on the order of the mails, so they are not shuffled.
    detected = sum(1 for mail in mail_datas if MailSignatureParser.find_sign_zone(mail['body'])[1])
    accuracy_ratio = detected / float(total)
    MailGlobal.logger.info("signature mail metric of accuracy is : %.8f" % accuracy_ratio)
def check_signature(mail_file):
    """
    Interactive eyeballing tool: draw random mails from a pickled mail set and,
    for each one with a detected signature, print it, drop into pdb, then print
    the stripped body.  Loops until interrupted.

    :param mail_file: path of the pickled mail set loaded through mail_entity.MailSet
    """
    import random
    from mail_entity import MailSet
    import pdb
    mail_sets = MailSet()
    mail_sets.load(mail_file, file_format='pkl')
    while True:
        picked = random.randint(0, mail_sets.length() - 1)
        mail = mail_sets.df.iloc[picked]
        stripped_body, is_sign = MailSignatureParser.find_sign_zone(mail['body'])
        if not is_sign:
            continue
        # Single-argument print calls: same output as the print statements they replace.
        print("\n" * 3 + "^" * 40)
        print(mail['body'])
        pdb.set_trace()
        print("\n++++++++++\n")
        print(stripped_body)
def sample_sign_mail(mail_files, output):
    """
    Interactive labelling tool: show random mails and append the ones the
    operator marks as signed to `output`, one JSON document per line.

    Only mails of at least 8 lines are offered.  The session ends when the
    operator sends EOF or Ctrl-C at the prompt, or immediately when no mail
    qualifies.

    :param mail_files: paths handed to mail_io.load_mail_files
    :param output: path of the JSON-lines file that is appended to
    """
    import json
    import random
    from mail_io import load_mail_files
    mail_datas = load_mail_files(mail_files)
    # Filter once up front: drawing from the raw set and rejecting short mails
    # never terminates when no mail is long enough, and fails on an empty set.
    # presumably load_mail_files returns a list-like of mails -- verify against mail_io
    candidates = [mail for mail in mail_datas if len(mail['body'].split('\n')) >= 8]
    if not candidates:
        return
    with open(output, 'a+') as sign_writer:
        while True:
            mail = random.choice(candidates)
            print("\n" * 3 + "^" * 50)
            print(mail['body'])
            is_sign = None
            while is_sign not in ('0', '1'):
                try:
                    # an empty answer counts as "0"
                    is_sign = raw_input("is sign email ? [0, 1]: ") or "0"
                except (EOFError, KeyboardInterrupt):
                    return  # operator closed the session
            if is_sign != "1":
                continue
            # Only mails marked as signed are recorded.
            json_mail = mail.m
            json_mail['is_sign'] = int(is_sign)
            sign_writer.write("%s\n" % json.dumps(json_mail))
if __name__ == "__main__":
    # Manual smoke test on the inline HTML mail defined right below.
    # The other helpers of this module can be run from here instead:
    #   sample_sign_mail(["./data/2015-12-11"], "./data/test_sign_mail.json")
    #   metric_signature("./data/test_sign_mail.json")
    #   check_signature("./data/2015-12-18_token.pkl")
    from mail_preprocess import parse_mail_body
    MailGlobal.logger._debug = True
- mail_body = u"\r\n\r\n\r\n<div class=\"\">\r\n<p class=\"\" style=\"mso-margin-top-alt:auto;mso-margin-bottom-alt:auto\"><span lang=\"EN-US\" style=\"font-family:"Arial","sans-serif";color:black\">Hi</span><span style=\"font-family:宋体;color:black\">,赵丽超,</span><span lang=\"EN-US\"><o:p></o:p></span></p>\r\n<p class=\"\" style=\"mso-margin-top-alt:auto;mso-margin-bottom-alt:auto\"><span style=\"font-family:宋体;color:black\">请在</span><span lang=\"EN-US\" style=\"font-family:"Arial","sans-serif";color:black\"><a target=\"_blank\" href=\"http://url.qmail.com/cgi-bin/safejmp?action=check_link&url=http%3A%2F%2Fikang.com%2F\" target=\"_blank\">http://iKang.com</a></span><span style=\"font-family:宋体;color:black\">或</span><span style=\"font-family:"Arial","sans-serif";color:black\">\r\n<span lang=\"EN-US\"><a target=\"_blank\" href=\"http://url.qmail.com/cgi-bin/safejmp?action=check_link&url=http%3A%2F%2Fm.ikang.com%2F\" target=\"_blank\">http://m.iKang.com</a></span></span><span style=\"font-family:宋体;color:black\">使用以下卡号和密码进行预约体检(无需付费,体检报告会由医院直接送往公司无须另外跟进),可在元旦期间进行体检:</span><span lang=\"EN-US\"><o:p></o:p></span></p>\r\n<p class=\"\" style=\"mso-margin-top-alt:auto;mso-margin-bottom-alt:auto\"><span style=\"font-family:宋体;color:black\">卡号:</span><span lang=\"EN-US\" style=\"font-size:10.0pt;font-family:"Arial","sans-serif";color:black\">1111000022977623</span><span lang=\"EN-US\"><o:p></o:p></span></p>\r\n<p class=\"\" style=\"mso-margin-top-alt:auto;mso-margin-bottom-alt:auto\"><span style=\"font-family:宋体;color:black\">密码:</span><span lang=\"EN-US\" style=\"font-size:10.0pt;font-family:"Arial","sans-serif";color:black\">195555</span><span lang=\"EN-US\"><o:p></o:p></span></p>\r\n<p class=\"\" style=\"mso-margin-top-alt:auto;mso-margin-bottom-alt:auto\"><span style=\"font-family:宋体;color:black\">如有疑问,随时沟通。收到请回复,谢谢!</span><span lang=\"EN-US\"><o:p></o:p></span></p>\r\n<p class=\"\"><span lang=\"EN-US\" style=\"color:#1F497D\"><o:p> </o:p></span></p>\r\n<p 
class=\"\"><span lang=\"EN-US\" style=\"color:#1F497D\"><o:p> </o:p></span></p>\r\n<p class=\"\"><span style=\"font-size:12.0pt;font-family:"微软雅黑","sans-serif";color:#5F497A\">柏雪宇</span><span lang=\"EN-US\" style=\"font-size:12.0pt;font-family:"微软雅黑","sans-serif";color:#5F497A\"> <o:p></o:p></span></p>\r\n<p class=\"\"><span style=\"font-size:12.0pt;font-family:"微软雅黑","sans-serif";color:#5F497A\">人力资源部</span><span lang=\"EN-US\" style=\"font-size:12.0pt;font-family:"微软雅黑","sans-serif";color:#5F497A\"><o:p></o:p></span></p>\r\n<p class=\"\"><span lang=\"EN-US\" style=\"font-family:宋体;color:#505050\"><o:p> </o:p></span></p>\r\n<p class=\"\"><b><span style=\"font-size:14.0pt;font-family:"微软雅黑","sans-serif";color:#387D39\">奇虎<span lang=\"EN-US\">360<o:p></o:p></span></span></b></p>\r\n<p class=\"\"><b><span style=\"font-size:12.0pt;font-family:华文细黑;color:#505050\">手机:</span></b><span lang=\"EN-US\" style=\"font-family:华文细黑;color:#505050\">13260819821<o:p></o:p></span></p>\r\n<p class=\"\"><b><span style=\"font-size:12.0pt;font-family:华文细黑;color:#505050\">电话:</span></b><span lang=\"EN-US\" style=\"font-family:华文细黑;color:#505050\">010-5244-8212<o:p></o:p></span></p>\r\n<p class=\"\"><b><span style=\"font-size:12.0pt;font-family:华文细黑;color:#505050\">邮件:</span></b><u><span lang=\"EN-US\" style=\"font-family:华文细黑;color:blue\">baixueyu@360.cn</span></u><span lang=\"EN-US\" style=\"font-family:华文细黑;color:#505050\"><o:p></o:p></span></p>\r\n<p class=\"\"><b><span style=\"font-size:12.0pt;font-family:华文细黑;color:#505050\">地址:</span></b><span style=\"font-family:华文细黑;color:#505050\">北京市朝阳区酒仙桥路<span lang=\"EN-US\">6</span>号院(电子城</span><span style=\"font-family:华文细黑;color:#505050\">•</span><span style=\"font-family:华文细黑;color:#505050\">国际电子总部)</span><span lang=\"EN-US\" style=\"font-family:华文细黑;color:#505050\">2</span><span style=\"font-family:华文细黑;color:#505050\">号楼</span><span lang=\"EN-US\" style=\"font-family:华文细黑;color:#505050\"><o:p></o:p></span></p>\r\n<p class=\"\"><span 
lang=\"EN-US\" style=\"color:#1F497D\"><o:p> </o:p></span></p>\r\n<p class=\"\" align=\"left\" style=\"text-align:left\"><b><span style=\"font-size:10.0pt;font-family:宋体\">发件人<span lang=\"EN-US\">:</span></span></b><span lang=\"EN-US\" style=\"font-size:10.0pt;font-family:宋体\"> hengyiponiu@sina.com [mailto:hengyiponiu@sina.com]\r\n<br>\r\n</span><b><span style=\"font-size:10.0pt;font-family:宋体\">发送时间<span lang=\"EN-US\">:</span></span></b><span lang=\"EN-US\" style=\"font-size:10.0pt;font-family:宋体\"> 2015</span><span style=\"font-size:10.0pt;font-family:宋体\">年<span lang=\"EN-US\">12</span>月<span lang=\"EN-US\">29</span>日<span lang=\"EN-US\">\r\n 11:49<br>\r\n</span><b>收件人<span lang=\"EN-US\">:</span></b><span lang=\"EN-US\"> </span>柏雪宇<span lang=\"EN-US\"><br>\r\n</span><b>主题<span lang=\"EN-US\">:</span></b><span lang=\"EN-US\"> </span>回复:答复<span lang=\"EN-US\">:\r\n</span>回复:欢迎加入<span lang=\"EN-US\">360<o:p></o:p></span></span></p>\r\n<p class=\"\" align=\"left\" style=\"text-align:left\"><span lang=\"EN-US\"><o:p> </o:p></span></p>\r\n<p class=\"\"><span lang=\"EN-US\"><o:p> </o:p></span></p>\r\n</div>\r\n\r\n\r\n\r\n"
# Show the parsed body, then the verdict and the body with the signature removed.
mail_body = parse_mail_body(mail_body)
print(mail_body)
mail_body, is_sign = MailSignatureParser.find_sign_zone(mail_body)
print(is_sign)
print(mail_body)