PageRenderTime 244ms CodeModel.GetById 16ms RepoModel.GetById 0ms app.codeStats 0ms

/mail_signature.py

https://gitlab.com/yaojian/RenjuAI
Python | 324 lines | 282 code | 20 blank | 22 comment | 1 complexity | 9c99b4f2cbe5adcdc5e1020eeeabcbba MD5 | raw file
  1. #!/usr/bin/python
  2. # -*- coding: utf-8 -*-
  3. # author: yaojian-xy <yaojian-xy@360.cn>
  4. import re
  5. from sklearn.externals.joblib import Parallel, delayed
  6. from mail_global import MailGlobal, TimeoutException, time_limit
  class MailSignatureParser:
      """
      Detect and strip the signature zone (person name, address, phone,
      zip code, web site, ...) found at the end of a mail body.

      Evidence for a signature comes from two pattern families:

      * ``ENTITY_MAP`` -- labelled fields at the start of a line, i.e. a label
        such as ``Tel`` or ``E-mail`` followed by a colon;
      * ``REGEX_MAP`` -- bare values (phone number, URL, address, ...).

      Every distinct key found counts once; a zone is accepted as a signature
      as soon as ``max_matches`` distinct keys have been collected.
      """
      CHINESE_SURNAME = (u"李王张刘陈杨赵黄周吴徐孙胡朱高林何郭马罗梁宋郑谢韩唐冯于董萧程曹袁邓许傅沈曾彭吕"
                         u"苏卢蒋蔡贾丁魏薛叶阎余潘杜戴夏钟汪田任姜范方石姚谭廖邹熊金陆郝孔白崔康毛邱秦江史"
                         u"顾侯邵孟龙万段雷钱汤尹黎易常武乔贺赖龚文")

      KEY_EMAIL, KEY_PHONE, KEY_ADDRESS, KEY_ZIP_CODE, KEY_COMPANY = 'email', 'phone', 'address', 'zipcode', 'company'
      KEY_WEBSITE, KEY_PERSON, KEY_OTHER = 'website', 'person', 'other'

      # Field labels (regex alternations) that introduce a "label: value" line.
      ENTITY_LABELS = {
          KEY_ADDRESS: u"总\s*部|地\s*址|addr?",
          KEY_PHONE: u"电\s*话|手\s*机|手机号码|Tel|座机|Mobile",
          KEY_EMAIL: u"E-?mail",
          KEY_ZIP_CODE: u"邮\s*编|PC\.",
          KEY_PERSON: u"联系人|Contact",
          KEY_WEBSITE: u"网\s*站|website",
          KEY_OTHER: u"Q\s*Q|飞\s*信",
      }
      # key -> compiled "label followed by a colon" pattern. Leading blanks,
      # quote markers ('>') and '*' are tolerated in front of the label.
      # NOTE(review): the colon classes in this class were listed as "[::]"
      # (the same ASCII colon twice), which looks like a full-width colon
      # (U+FF1A) that was normalised away; it is spelled as an escape here so
      # both forms are accepted. Confirm against the repository copy.
      ENTITY_MAP = dict((key, re.compile(u'^[ >\*]*(%s)[:\uff1a]' % label, re.IGNORECASE))
                        for key, label in ENTITY_LABELS.items())

      REG_EMAIL = MailGlobal.email_reg_2
      # ex: http://www.hylandslaw.com or www.hylandslaw.com
      REG_URL = re.compile(u"(?:http[s]?://|www\.){1}(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
      # ex: ["000-000-0000", "000 000 0000", "+86 10 6588 8825", "+86-21-60410082"]
      REG_PHONE = re.compile(
          u"(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}"
          u"|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}"
          u"|\d{3}[-\.\s]??\d{4}"
          u"|\d{2}[-\s]?\d{2}[-\s]?\d{4}[-\s]?\d{4})")
      REG_COMPANY = re.compile(u"[:\uff1a ]*([\u4e00-\u9fff|\u0020-\u00ff]{2,}公司)")
      REG_ADDRESS = re.compile(u"(?:.+省)?(?:.+市)?.+区(?:.+路)?(?:.+号)?")
      REG_ZIP_CODE = re.compile(u"(?:邮\s*编|PC\.)\s*[:\uff1a]?\s*\d{6}")
      # A listed surname followed by one or two CJK characters, standing alone
      # or after whitespace / a colon.
      REG_PERSON = re.compile(u"(^|[\s:\uff1a]+)[%s]([\u4e00-\u9fa5]{1,2}|\s{1,2}[\u4e00-\u9fa5])($|\s+)" % CHINESE_SURNAME)

      # key -> compiled pattern recognising a bare value anywhere in a line.
      REGEX_MAP = {
          KEY_EMAIL: REG_EMAIL,
          KEY_WEBSITE: REG_URL,
          KEY_PHONE: REG_PHONE,
          KEY_COMPANY: REG_COMPANY,
          KEY_ZIP_CODE: REG_ZIP_CODE,
          KEY_ADDRESS: REG_ADDRESS,
          KEY_PERSON: REG_PERSON,
      }

      # A line made only of four or more '-', '_' or em dashes.
      SEPARATE_LINE_REGEX = re.compile(u"^\s*(?:-|_|—){4,}\s*$", re.MULTILINE)
      # Two or more consecutive line breaks, each optionally followed by blanks.
      SERIES_EMPTY_LINE_REGEX = re.compile(u"(?:\n *){2,}")
      # HTML residue removed by preprocess().
      REPLACE_REGEX = re.compile('&nbsp;|\"')
      # Header line of a quoted reply ("From: ..." and its Chinese variants).
      REPLY_REGEX = re.compile(u'^\s*[>\*]{,3}\s*\*?(发件人|寄件者|From)[:\uff1a]{1}.+', re.MULTILINE)
      # "----- Original Message -----" style banner.
      REPLY_SEPARATE_REGEX = re.compile(u"^\s*(?:-|—){4,}\s*(?:原始邮件|Original Message)\s*(?:-|—){4,}\s*$",
                                        re.MULTILINE | re.IGNORECASE)

      # Second element of the tuple returned by find_sign_zone() on success.
      IS_SIGNATURE_MAIL = True

      def __init__(self):
          pass

      @staticmethod
      def preprocess(mail_body):
          """
          Normalise a mail body and split it into lines.

          Removes ``&nbsp;`` and double quotes, converts CRLF to LF and
          collapses every run of blank lines into a single blank line.

          :param mail_body: mail body text
          :return: list of lines
          """
          body_stream = MailSignatureParser.REPLACE_REGEX.sub('', mail_body)
          body_stream = body_stream.replace('\r\n', '\n')
          body_stream = MailSignatureParser.SERIES_EMPTY_LINE_REGEX.sub('\n\n', body_stream)
          return body_stream.split('\n')

      @staticmethod
      def remove_reply_pos(mail_body):
          """
          Cut a quoted reply off the end of the body.

          The "Original Message" banner is looked for first, then a
          "From:" style header line.

          :param mail_body: mail body text
          :return: text before the first reply marker, or the whole body when
                   no marker is found
          """
          for marker in (MailSignatureParser.REPLY_SEPARATE_REGEX, MailSignatureParser.REPLY_REGEX):
              found = marker.search(mail_body)
              if found is not None:
                  return mail_body[:found.start()]
          return mail_body

      @staticmethod
      def _collect_line_keys(line, first_value_only):
          """
          Return the set of signature keys evidenced by one stripped line.

          Labelled-field patterns (``ENTITY_MAP``) are tried in dict order and
          the first one that matches ends the search. As soon as one of them
          fails to match, the bare-value patterns (``REGEX_MAP``) are scanned
          once; their result cannot change, so the scan is not repeated for the
          remaining labelled patterns.

          NOTE(review): this nesting was reconstructed from a listing that had
          lost its indentation -- confirm against the repository copy.

          :param line: one stripped line of the mail body
          :param first_value_only: stop altogether at the first bare-value hit
                                   (check_sign_zone) instead of collecting every
                                   hit (find_sign_zone)
          :return: set of KEY_* values
          """
          keys = set()
          value_scan_done = False
          for entity_key, entity_reg in MailSignatureParser.ENTITY_MAP.items():
              fits = entity_reg.findall(line)
              if fits and fits[0]:  # standard "label: value" signature line
                  keys.add(entity_key)
                  break
              if value_scan_done:
                  continue
              value_scan_done = True
              for key, reg in MailSignatureParser.REGEX_MAP.items():
                  found = reg.search(line)
                  if found is None:
                      continue
                  MailGlobal.logger.debug("found signature info, key=%s, value=%s" %
                                          (key, found.group(0).encode('utf8')))
                  keys.add(key)
                  if first_value_only:
                      return keys
          return keys

      @staticmethod
      def check_sign_zone(mail_body_lines, max_matches=3, max_lines=12):
          """
          Verify whether a candidate zone is a signature zone.

          Only the first ``max_lines`` lines are inspected, top to bottom.

          :param mail_body_lines: lines of the candidate zone
          :param max_matches: number of distinct keys required
          :param max_lines: maximum number of lines inspected
          :return: True once ``max_matches`` distinct keys were found, else False
          """
          match_list = set()
          for raw_line in mail_body_lines[:max(max_lines, 0)]:
              line = raw_line.strip()
              # A line containing sentence punctuation is treated as prose.
              # NOTE(review): the comma may originally have been the full-width
              # U+FF0C (see the colon note on ENTITY_MAP) -- confirm.
              if line.find(u"。") > 0 or line.find(u',') > 0:
                  continue
              match_list |= MailSignatureParser._collect_line_keys(line, first_value_only=True)
              if len(match_list) >= max_matches:
                  return True
          return False

      @staticmethod
      def find_sign_zone(mail_body, max_lines=12, max_matches=3, max_empty_line=2):
          """
          Strip the signature zone from a mail body.

          Steps: drop a quoted reply, try the zone below a separator line, then
          scan the last ``max_lines`` lines bottom-up.

          :param mail_body: mail body text
          :param max_lines: maximum number of lines inspected
          :param max_matches: number of distinct keys required
          :param max_empty_line: consecutive empty lines that end the bottom-up
                                 scan once signature evidence has been seen
          :return: ``(body, is_signature)``; ``body`` is the text with the
                   signature removed when ``is_signature`` is True, otherwise
                   the reply-stripped, right-trimmed body
          """
          parser = MailSignatureParser
          # check reply mail
          mail_body = parser.remove_reply_pos(mail_body)
          mail_body = mail_body.rstrip("\n\r ")

          # A signature is only searched for when the body holds a separator
          # line or a run of blank lines.
          # NOTE(review): SERIES_EMPTY_LINE_REGEX is applied before CRLF is
          # converted, and it cannot match across '\r' -- confirm that bodies
          # reach this point with LF line endings.
          sep_match = parser.SEPARATE_LINE_REGEX.search(mail_body)
          sep_empty_lines = parser.SERIES_EMPTY_LINE_REGEX.search(mail_body)
          if sep_match is None and sep_empty_lines is None:
              return mail_body, not parser.IS_SIGNATURE_MAIL

          if sep_match is not None:
              sep_pos = sep_match.start()
              # Forward the caller's thresholds so both detection paths agree.
              if parser.check_sign_zone(mail_body[sep_pos:].split("\n"),
                                        max_matches=max_matches, max_lines=max_lines):
                  return mail_body[:sep_pos], parser.IS_SIGNATURE_MAIL

          # preprocess, split by lines
          mail_body_lines = parser.preprocess(mail_body)
          step = min(max_lines, len(mail_body_lines))
          st, ed = len(mail_body_lines) - step, len(mail_body_lines) - 1
          match_list = set()
          last_match_pos = len(mail_body_lines)  # topmost line with evidence so far
          flag_sign_zone = False
          accum_empty_lines = 0
          for idx in range(ed, st - 1, -1):
              line = mail_body_lines[idx].strip()
              # A line with a full stop or several commas is treated as prose.
              if line.find(u"。") > 0 or line.count(u',') > 1:
                  continue
              if line == "":
                  if flag_sign_zone:
                      accum_empty_lines += 1
                      if accum_empty_lines >= max_empty_line:
                          # The candidate zone ended before enough evidence
                          # was collected: report "no signature".
                          break
                  continue
              accum_empty_lines = 0
              line_keys = parser._collect_line_keys(line, first_value_only=False)
              if line_keys:
                  last_match_pos = idx
                  flag_sign_zone = True
                  match_list |= line_keys
              if len(match_list) >= max_matches:
                  return u'\n'.join(mail_body_lines[:last_match_pos]), parser.IS_SIGNATURE_MAIL
          # not found signature
          return mail_body, not parser.IS_SIGNATURE_MAIL
  192. # def parallel_handle_signature(mail_sets, n_jobs=MailGlobal.etc_map['n_jobs']):
  193. # result = Parallel(n_jobs=n_jobs, verbose=11)(
  194. # delayed(single_handle_signature)(index, mail['body']) for index, mail in mail_sets.df.iterrows())
  195. # for item in result:
  196. # index, mail_body = item
  197. # mail_sets.df.loc[index, 'body'] = mail_body
  198. # return mail_sets
  def parallel_handle_signature(mails, n_jobs=MailGlobal.etc_map['n_jobs']):
      """
      Strip the signature of every mail, fanning the work out with joblib.

      :param mails: indexable collection of mails; each mail's 'body' entry is
                    replaced in place by the stripped body
      :param n_jobs: job count passed to Parallel
      :return: the same ``mails`` object
      """
      verbosity = 1 if MailGlobal.DEBUG_MODEL else 0
      tasks = (delayed(single_handle_signature)(pos, item['body'])
               for pos, item in enumerate(mails))
      # Each worker hands back (position, stripped body).
      for pos, stripped_body in Parallel(n_jobs=n_jobs, verbose=verbosity)(tasks):
          mails[pos]['body'] = stripped_body
      return mails
  def single_handle_signature(index, mail_body, time_it=10):
      """
      Strip the signature from one mail body under a time limit.

      :param index: position of the mail, returned unchanged
      :param mail_body: mail body text
      :param time_it: value passed to time_limit()
      :return: ``(index, body)``; the body is returned untouched when the
               search raises TimeoutException
      """
      stripped_body = mail_body
      try:
          with time_limit(time_it):
              stripped_body = MailSignatureParser.find_sign_zone(mail_body)[0]
      except TimeoutException:
          MailGlobal.logger.warn("signature exec time out, index=%s, text=%s" % (index, mail_body.encode('utf8')))
      return index, stripped_body
  def metric_signature(mail_file):
      """
      Log the share of mails in ``mail_file`` in which a signature is found.

      :param mail_file: path handed to mail_io.load_mail_file
      :return: None; the ratio is written to MailGlobal.logger
      """
      from mail_io import load_mail_file

      mail_datas = load_mail_file(mail_file, raw_type="dict")
      total = len(mail_datas)
      if total == 0:
          # Nothing to average over; avoid dividing by zero.
          MailGlobal.logger.warn("signature mail metric skipped, no mail loaded from %s" % mail_file)
          return
      # The order of the mails does not influence a mean, so they are not shuffled.
      sign_count = sum(1 for mail in mail_datas
                       if MailSignatureParser.find_sign_zone(mail['body'])[1])
      accuracy_ratio = float(sign_count) / total
      MailGlobal.logger.info("signature mail metric of accuracy is : %.8f" % accuracy_ratio)
  def check_signature(mail_file):
      """
      Interactive inspection helper: keeps picking random mails from a pickled
      MailSet and, for each one in which a signature is found, prints the
      original body, drops into pdb, then prints the stripped body.

      Loops forever; stop it from the debugger or with an interrupt.

      :param mail_file: path of the pickled mail set
      """
      import pdb
      import random
      from mail_entity import MailSet

      mail_sets = MailSet()
      mail_sets.load(mail_file, file_format='pkl')
      while True:
          pick = random.randint(0, mail_sets.length() - 1)
          mail = mail_sets.df.iloc[pick]
          stripped_body, is_sign = MailSignatureParser.find_sign_zone(mail['body'])
          if not is_sign:
              continue
          # Single-argument print: same output as the statement form.
          print("\n" * 3 + "^" * 40)
          print(mail['body'])
          pdb.set_trace()
          print("\n++++++++++\n")
          print(stripped_body)
  def sample_sign_mail(mail_files, output):
      """
      Interactive labelling helper: shows random mails of at least 8 lines and
      appends those the operator marks as signature mails to ``output`` as
      JSON lines carrying ``is_sign = 1``.

      Loops forever; stop it with an interrupt.

      :param mail_files: paths handed to mail_io.load_mail_files
      :param output: path of the JSON-lines file, opened in append mode
      """
      import json
      import random
      from mail_io import load_mail_files

      mail_datas = load_mail_files(mail_files)
      with open(output, 'a+') as sign_writer:
          while True:
              mail = random.choice(mail_datas)
              if len(mail['body'].split('\n')) < 8:
                  continue
              print("\n" * 3 + "^" * 50)
              print(mail['body'])
              # Ask until the answer is '0' or '1'; an empty answer means '0'.
              answer = None
              while answer not in ('0', '1'):
                  answer = raw_input("is sign email ? [0, 1]: ") or "0"
              if answer != "1":
                  continue
              json_mail = mail.m
              json_mail['is_sign'] = int(answer)
              sign_writer.write("%s\n" % json.dumps(json_mail))
  if __name__ == "__main__":
      # Other entry points, kept for manual runs:
      # sample_sign_mail(["./data/2015-12-11"], "./data/test_sign_mail.json")
      # metric_signature("./data/test_sign_mail.json")
      # check_signature("./data/2015-12-18_token.pkl")
      from mail_preprocess import parse_mail_body

      MailGlobal.logger._debug = True
      # Sample HTML mail (a body with a signature followed by a quoted reply
      # header). Adjacent literals, one per CRLF-terminated source line.
      sample_html = (
          u"\r\n\r\n\r\n"
          u"<div class=\"\">\r\n"
          u"<p class=\"\" style=\"mso-margin-top-alt:auto;mso-margin-bottom-alt:auto\"><span lang=\"EN-US\" style=\"font-family:&quot;Arial&quot;,&quot;sans-serif&quot;;color:black\">Hi</span><span style=\"font-family:宋体;color:black\">,赵丽超,</span><span lang=\"EN-US\"><o:p></o:p></span></p>\r\n"
          u"<p class=\"\" style=\"mso-margin-top-alt:auto;mso-margin-bottom-alt:auto\"><span style=\"font-family:宋体;color:black\">请在</span><span lang=\"EN-US\" style=\"font-family:&quot;Arial&quot;,&quot;sans-serif&quot;;color:black\"><a target=\"_blank\" href=\"http://url.qmail.com/cgi-bin/safejmp?action=check_link&amp;url=http%3A%2F%2Fikang.com%2F\" target=\"_blank\">http://iKang.com</a></span><span style=\"font-family:宋体;color:black\">或</span><span style=\"font-family:&quot;Arial&quot;,&quot;sans-serif&quot;;color:black\">\r\n"
          u"<span lang=\"EN-US\"><a target=\"_blank\" href=\"http://url.qmail.com/cgi-bin/safejmp?action=check_link&amp;url=http%3A%2F%2Fm.ikang.com%2F\" target=\"_blank\">http://m.iKang.com</a></span></span><span style=\"font-family:宋体;color:black\">使用以下卡号和密码进行预约体检(无需付费,体检报告会由医院直接送往公司无须另外跟进),可在元旦期间进行体检:</span><span lang=\"EN-US\"><o:p></o:p></span></p>\r\n"
          u"<p class=\"\" style=\"mso-margin-top-alt:auto;mso-margin-bottom-alt:auto\"><span style=\"font-family:宋体;color:black\">卡号:</span><span lang=\"EN-US\" style=\"font-size:10.0pt;font-family:&quot;Arial&quot;,&quot;sans-serif&quot;;color:black\">1111000022977623</span><span lang=\"EN-US\"><o:p></o:p></span></p>\r\n"
          u"<p class=\"\" style=\"mso-margin-top-alt:auto;mso-margin-bottom-alt:auto\"><span style=\"font-family:宋体;color:black\">密码:</span><span lang=\"EN-US\" style=\"font-size:10.0pt;font-family:&quot;Arial&quot;,&quot;sans-serif&quot;;color:black\">195555</span><span lang=\"EN-US\"><o:p></o:p></span></p>\r\n"
          u"<p class=\"\" style=\"mso-margin-top-alt:auto;mso-margin-bottom-alt:auto\"><span style=\"font-family:宋体;color:black\">如有疑问,随时沟通。收到请回复,谢谢!</span><span lang=\"EN-US\"><o:p></o:p></span></p>\r\n"
          u"<p class=\"\"><span lang=\"EN-US\" style=\"color:#1F497D\"><o:p>&nbsp;</o:p></span></p>\r\n"
          u"<p class=\"\"><span lang=\"EN-US\" style=\"color:#1F497D\"><o:p>&nbsp;</o:p></span></p>\r\n"
          u"<p class=\"\"><span style=\"font-size:12.0pt;font-family:&quot;微软雅黑&quot;,&quot;sans-serif&quot;;color:#5F497A\">柏雪宇</span><span lang=\"EN-US\" style=\"font-size:12.0pt;font-family:&quot;微软雅黑&quot;,&quot;sans-serif&quot;;color:#5F497A\">&nbsp;<o:p></o:p></span></p>\r\n"
          u"<p class=\"\"><span style=\"font-size:12.0pt;font-family:&quot;微软雅黑&quot;,&quot;sans-serif&quot;;color:#5F497A\">人力资源部</span><span lang=\"EN-US\" style=\"font-size:12.0pt;font-family:&quot;微软雅黑&quot;,&quot;sans-serif&quot;;color:#5F497A\"><o:p></o:p></span></p>\r\n"
          u"<p class=\"\"><span lang=\"EN-US\" style=\"font-family:宋体;color:#505050\"><o:p>&nbsp;</o:p></span></p>\r\n"
          u"<p class=\"\"><b><span style=\"font-size:14.0pt;font-family:&quot;微软雅黑&quot;,&quot;sans-serif&quot;;color:#387D39\">奇虎<span lang=\"EN-US\">360<o:p></o:p></span></span></b></p>\r\n"
          u"<p class=\"\"><b><span style=\"font-size:12.0pt;font-family:华文细黑;color:#505050\">手机:</span></b><span lang=\"EN-US\" style=\"font-family:华文细黑;color:#505050\">13260819821<o:p></o:p></span></p>\r\n"
          u"<p class=\"\"><b><span style=\"font-size:12.0pt;font-family:华文细黑;color:#505050\">电话:</span></b><span lang=\"EN-US\" style=\"font-family:华文细黑;color:#505050\">010-5244-8212<o:p></o:p></span></p>\r\n"
          u"<p class=\"\"><b><span style=\"font-size:12.0pt;font-family:华文细黑;color:#505050\">邮件:</span></b><u><span lang=\"EN-US\" style=\"font-family:华文细黑;color:blue\">baixueyu@360.cn</span></u><span lang=\"EN-US\" style=\"font-family:华文细黑;color:#505050\"><o:p></o:p></span></p>\r\n"
          u"<p class=\"\"><b><span style=\"font-size:12.0pt;font-family:华文细黑;color:#505050\">地址:</span></b><span style=\"font-family:华文细黑;color:#505050\">北京市朝阳区酒仙桥路<span lang=\"EN-US\">6</span>号院(电子城</span><span style=\"font-family:华文细黑;color:#505050\">&#8226;</span><span style=\"font-family:华文细黑;color:#505050\">国际电子总部)</span><span lang=\"EN-US\" style=\"font-family:华文细黑;color:#505050\">2</span><span style=\"font-family:华文细黑;color:#505050\">号楼</span><span lang=\"EN-US\" style=\"font-family:华文细黑;color:#505050\"><o:p></o:p></span></p>\r\n"
          u"<p class=\"\"><span lang=\"EN-US\" style=\"color:#1F497D\"><o:p>&nbsp;</o:p></span></p>\r\n"
          u"<p class=\"\" align=\"left\" style=\"text-align:left\"><b><span style=\"font-size:10.0pt;font-family:宋体\">发件人<span lang=\"EN-US\">:</span></span></b><span lang=\"EN-US\" style=\"font-size:10.0pt;font-family:宋体\"> hengyiponiu@sina.com [mailto:hengyiponiu@sina.com]\r\n"
          u"<br>\r\n"
          u"</span><b><span style=\"font-size:10.0pt;font-family:宋体\">发送时间<span lang=\"EN-US\">:</span></span></b><span lang=\"EN-US\" style=\"font-size:10.0pt;font-family:宋体\"> 2015</span><span style=\"font-size:10.0pt;font-family:宋体\">年<span lang=\"EN-US\">12</span>月<span lang=\"EN-US\">29</span>日<span lang=\"EN-US\">\r\n"
          u" 11:49<br>\r\n"
          u"</span><b>收件人<span lang=\"EN-US\">:</span></b><span lang=\"EN-US\"> </span>柏雪宇<span lang=\"EN-US\"><br>\r\n"
          u"</span><b>主题<span lang=\"EN-US\">:</span></b><span lang=\"EN-US\"> </span>回复:答复<span lang=\"EN-US\">:\r\n"
          u"</span>回复:欢迎加入<span lang=\"EN-US\">360<o:p></o:p></span></span></p>\r\n"
          u"<p class=\"\" align=\"left\" style=\"text-align:left\"><span lang=\"EN-US\"><o:p>&nbsp;</o:p></span></p>\r\n"
          u"<p class=\"\"><span lang=\"EN-US\"><o:p>&nbsp;</o:p></span></p>\r\n"
          u"</div>\r\n"
          u"\r\n\r\n\r\n"
      )
      plain_body = parse_mail_body(sample_html)
      print(plain_body)
      stripped_body, is_sign = MailSignatureParser.find_sign_zone(plain_body)
      print(is_sign)
      print(stripped_body)