PageRenderTime 109ms CodeModel.GetById 24ms RepoModel.GetById 2ms app.codeStats 0ms

/mail_reply_parser.py

https://gitlab.com/yaojian/RenjuAI
Python | 511 lines | 482 code | 17 blank | 12 comment | 22 complexity | bdd1b894032b7c58a3e70e226aedceb3 MD5 | raw file
  1. #!/usr/bin/python
  2. # -*- coding: utf-8 -*-
  3. # author: yaojian-xy <yaojian-xy@360.cn>
  4. import re
  5. import sys
  6. import hashlib
  7. import time
  8. import simplejson as json
  9. import numpy as np
  10. from sklearn.externals.joblib import Parallel, delayed
  11. from mail_entity import Mail
  12. from mail_global import MailGlobal, TimeoutException, time_limit
  13. from mail_preprocess import advanced_parse_time, parse_address, parse_mail_body
# Reload the `sys` module and then switch the interpreter-wide default string
# encoding to UTF-8. NOTE(review): `reload` is only a builtin on Python 2, so
# this module presumably targets Python 2 only -- confirm before porting.
reload(sys)
sys.setdefaultencoding("utf-8")
  16. class MailReplyParser:
  17. KEY_FROM, KEY_TO, KEY_TIME, KEY_CC, KEY_SUBJ = 'from', 'to', 'time', 'cc', 'subj'
  18. HEADER_MAP = {u'发件人': KEY_FROM, u'寄件者': KEY_FROM, u'From': KEY_FROM,
  19. u'发送时间': KEY_TIME, u'寄件日期': KEY_TIME, u'Date': KEY_TIME, u'Sent': KEY_TIME,
  20. u'收件人': KEY_TO, u'收件者': KEY_TO, u'To': KEY_TO,
  21. u'抄送': KEY_CC, u'抄 送': KEY_CC, u'抄送人': KEY_CC, u'副本': KEY_CC, u'Cc': KEY_CC,
  22. u'主题': KEY_SUBJ, u'主 题': KEY_SUBJ, u'主旨': KEY_SUBJ, u'Subject': KEY_SUBJ,
  23. }
  24. HEADER_REGEX = re.compile(u'^[ ]*[>\*]{,3}[ ]*\*?(%s)[::]{1}(.+)' % '|'.join(HEADER_MAP.keys()))
  25. Multi_HEADER_REGEX = re.compile(u'^[ ]*[>\*]{,3}[ ]*\*?(%s)[::]{1}.+' % '|'.join(HEADER_MAP.keys()), re.MULTILINE)
  26. SUBJECT_REGEX = re.compile(u'^[ ]*(回复|答复|回覆|re|转发|Fw)[::]{1}.+', re.IGNORECASE)
  27. HR_LINE_REGEX = re.compile(u'^[ ]*[>\*]{,2}[-]{2,}[ ]*(原始邮件|Original Message)[ ]*[-]{2,}.*', re.IGNORECASE)
  28. ADDR_RM_REG = re.compile(u'\"|mailto:')
  29. ADDR_REG = re.compile(u'^[>|\]]([^<]*)[<|\[](.*)')
  30. FROM_TO, FROM_CC = 0, 1
  31. REPLY_MAIL, RELAY_MAIL = 'Reply', 'Forward'
  32. EXTRACT_FLAG_NORMAL = 0
  33. EXTRACT_FLAG_LACK_INFO = 1
  34. EXTRACT_FLAG_NO_HEADER = 2
  35. HEADER_END_KEY_SET = set([u'发件人', u'寄件者', u'From'])
  36. HEADER_START_KEY_SET = set([u'主题', u'主 题', u'主旨', u'Subject'])
  37. REPLY_STATUS_BODY = 0
  38. REPLY_STATUS_HEADER = 1
  39. REPLY_STATUS_HEADER_END = 2
  40. REG_FILTER_CHAR = re.compile(u"[ \r\n]")
  41. def __init__(self):
  42. pass
  43. @staticmethod
  44. def extract_reply_fragment(mail_body):
  45. """
  46. :param mail_body_lines:
  47. :return: mail body fragments
  48. """
  49. # preprocess mail_body
  50. mail_body = mail_body.replace('\r\n', '\n')
  51. mail_body = re.sub('[\n]{2,}', '\n', mail_body)
  52. # extract reply message
  53. mail_body_lines = mail_body.split('\n')
  54. flag = MailReplyParser.EXTRACT_FLAG_NORMAL
  55. reply_status = MailReplyParser.REPLY_STATUS_BODY
  56. reply_header_past_indics = None
  57. body_fragments = []
  58. # mail_body_lines.reverse() # traverse from end to begin
  59. body_fragment = {'body': {'ed': len(mail_body_lines) - 1, 'st': len(mail_body_lines) - 1}}
  60. found_header = False
  61. for idx in xrange(len(mail_body_lines) - 1, -1, -1):
  62. line = mail_body_lines[idx]
  63. header_pair = MailReplyParser.HEADER_REGEX.findall(line)
  64. if len(header_pair) == 1: # header part
  65. found_header = True
  66. header_nm, head_val = header_pair[0]
  67. if reply_status == MailReplyParser.REPLY_STATUS_BODY: # header part start
  68. if header_nm not in MailReplyParser.HEADER_START_KEY_SET:
  69. continue
  70. reply_status = MailReplyParser.REPLY_STATUS_HEADER
  71. reply_header_past_indics = idx
  72. elif reply_status == MailReplyParser.REPLY_STATUS_HEADER:
  73. for merge_idx in xrange(idx + 1, reply_header_past_indics):
  74. head_val += mail_body_lines[merge_idx]
  75. reply_header_past_indics = idx
  76. if header_nm in MailReplyParser.HEADER_END_KEY_SET:
  77. reply_status = MailReplyParser.REPLY_STATUS_HEADER_END
  78. body_fragment[MailReplyParser.HEADER_MAP[header_nm]] = head_val
  79. else: # text part
  80. if reply_status == MailReplyParser.REPLY_STATUS_HEADER:
  81. continue
  82. elif reply_status == MailReplyParser.REPLY_STATUS_HEADER_END: # header part end
  83. # append fragment
  84. if body_fragment['body']['st'] <= body_fragment['body']['ed'] \
  85. and all(key in body_fragment for key in ('subj', 'body', 'from')): #
  86. body_fragments.append(body_fragment)
  87. else:
  88. MailGlobal.logger.warn("reply msg extraction, lack info: %s" % str(body_fragment.keys()))
  89. flag = MailReplyParser.EXTRACT_FLAG_LACK_INFO
  90. body_fragment = {'body': {'ed': idx, 'st': idx}}
  91. reply_status = MailReplyParser.REPLY_STATUS_BODY
  92. if MailReplyParser.HR_LINE_REGEX.match(line): # split line
  93. body_fragment['body']['ed'] = idx - 1
  94. else:
  95. body_fragment['body']['st'] = idx
  96. if not found_header: # not found any mail header
  97. flag = MailReplyParser.EXTRACT_FLAG_NO_HEADER
  98. body_fragments.append(body_fragment) # append last fragment with only key(body)
  99. # mail_body_lines.reverse() # reverse to original body lines
  100. return mail_body_lines, body_fragments, flag
  101. @staticmethod
  102. def extract_addr(addr, alias_map, log_queue=None, desc=""):
  103. """
  104. :param addr:
  105. :param alias_map:
  106. :return: addr list
  107. """
  108. addr = MailReplyParser.ADDR_RM_REG.sub('', addr)
  109. # find email delim
  110. prob_delim = [u';', u",", u',']
  111. delim = ';'
  112. max_delim_cnt = 0
  113. for dlch in prob_delim:
  114. cnt = addr.count(dlch)
  115. if cnt > max_delim_cnt:
  116. delim = dlch
  117. max_delim_cnt = cnt
  118. # extract address
  119. addr_list = []
  120. for addr_unit in addr.split(delim):
  121. addr_unit = addr_unit.strip()
  122. addr_parsed = parse_address(addr_unit, desc=desc, log_queue=log_queue)
  123. if addr_parsed != "":
  124. addr_list.append(addr_parsed)
  125. if 'n' in addr_parsed and addr_parsed['n'] not in alias_map:
  126. alias_map[addr_parsed['n']] = addr_parsed['a']
  127. else:
  128. alias = addr_unit
  129. if alias in alias_map:
  130. addr_list.append({'a': alias_map[alias], 'n': alias})
  131. log_queue.pop()
  132. map(lambda x: x.setdefault('flag', 0), addr_list)
  133. return addr_list
  134. @staticmethod
  135. def trim(raw_text):
  136. return raw_text.strip(' \r\n')
  137. @staticmethod
  138. def alias_collect(x, alias_map):
  139. if type(x) is dict and 'n' in x:
  140. alias_map[x['n']] = x['a']
  141. @staticmethod
  142. def add_edge(net, addr1, addr2, itype, enhance=0.5):
  143. if (addr1, addr2) not in net:
  144. net[(addr1, addr2)] = [0, 0]
  145. net[(addr1, addr2)][itype] += 1
  146. # enhance reverse edge
  147. # if (addr2, addr1) not in net:
  148. # net[(addr2, addr1)] = [0, 0]
  149. # net[addr2, addr1][itype] += enhance
  150. @staticmethod
  151. def parse_fragment_addr(body_fragments, mail, log_queue=None):
  152. """
  153. :param body_fragments:
  154. :return: extract fragment body
  155. """
  156. # get alias map
  157. alias_map = dict()
  158. MailReplyParser.alias_collect(mail['from'], alias_map)
  159. map(lambda x: MailReplyParser.alias_collect(x, alias_map), mail['to'])
  160. map(lambda x: MailReplyParser.alias_collect(x, alias_map), mail['cc'])
  161. # extract address
  162. new_body_fragments = []
  163. for i, frag in enumerate(body_fragments):
  164. log_queue.append("")
  165. if 'from' not in frag or ('to' not in frag and 'cc' not in frag):
  166. continue
  167. addr_from = MailReplyParser.extract_addr(frag['from'], alias_map, desc="from", log_queue=log_queue)
  168. if len(addr_from) != 1:
  169. continue
  170. frag['from'] = addr_from[0]
  171. if 'to' in frag:
  172. frag['to'] = MailReplyParser.extract_addr(frag['to'], alias_map, desc="to", log_queue=log_queue)
  173. else:
  174. frag['to'] = []
  175. if 'cc' in frag:
  176. frag['cc'] = MailReplyParser.extract_addr(frag['cc'], alias_map, desc="cc", log_queue=log_queue)
  177. else:
  178. frag['cc'] = []
  179. if len(frag['to']) == 0 and len(frag['cc']) == 0:
  180. continue
  181. new_body_fragments.append(frag)
  182. return new_body_fragments
  183. @staticmethod
  184. def extract_interactive(body_fragments, mail, log_queue=None):
  185. """
  186. :param body_fragments:
  187. :return: extract interactive body
  188. """
  189. # get alias map
  190. alias_map = dict()
  191. MailReplyParser.alias_collect(mail['from'], alias_map)
  192. map(lambda x: MailReplyParser.alias_collect(x, alias_map), mail['to'])
  193. map(lambda x: MailReplyParser.alias_collect(x, alias_map), mail['cc'])
  194. # extract address
  195. inter_net = {}
  196. for i, frag in enumerate(body_fragments):
  197. log_queue.append("")
  198. if 'from' not in frag or ('to' not in frag and 'cc' not in frag):
  199. continue
  200. addr_from = MailReplyParser.extract_addr(frag['from'], alias_map, desc="from", log_queue=log_queue)
  201. if len(addr_from) != 1:
  202. continue
  203. addr_from = addr_from[0]['a']
  204. if 'to' in frag:
  205. addr_to = MailReplyParser.extract_addr(frag['to'], alias_map, desc="to", log_queue=log_queue)
  206. map(lambda x: MailReplyParser.add_edge(inter_net, addr_from, x['a'], MailReplyParser.FROM_TO), addr_to)
  207. if 'cc' in frag:
  208. addr_cc = MailReplyParser.extract_addr(frag['cc'], alias_map, desc="cc", log_queue=log_queue)
  209. map(lambda x: MailReplyParser.add_edge(inter_net, addr_from, x['a'], MailReplyParser.FROM_CC), addr_cc)
  210. return inter_net
  211. @staticmethod
  212. def parse_reply(mail, msg_type='all', log_queue=None):
  213. """
  214. parse reply message with,
  215. :param mail:
  216. :param msg_type: [all / last], 'all' means all text messages, 'last' means last text message
  217. :return: (mail reply message and mail interactive relationship)
  218. """
  219. if not MailReplyParser.is_reply(mail): # if not a reply message
  220. return None, None
  221. mail_body_lines, body_fragments, _ = MailReplyParser.extract_reply_fragment(mail['body'])
  222. if msg_type == 'all':
  223. inner_msg = ''
  224. # get all text message
  225. for frag in body_fragments[::-1]:
  226. inner_msg += '\n'.join(mail_body_lines[frag['body']['st']: (frag['body']['ed'] + 1)]) + '\n'
  227. else:
  228. # get last reply message
  229. frag = body_fragments[-1]
  230. inner_msg = '\n'.join(mail_body_lines[frag['body']['st']: (frag['body']['ed'] + 1)])
  231. # extract interactive part
  232. inter_net = MailReplyParser.extract_interactive(body_fragments, mail, log_queue=log_queue)
  233. return inner_msg, inter_net
  234. @staticmethod
  235. def category(mail):
  236. """
  237. check mail category [reply, or relay]
  238. :param mail:
  239. :return: reply mail
  240. """
  241. kw = MailReplyParser.SUBJECT_REGEX.findall(mail['subject'])
  242. if len(kw) == 0:
  243. return MailReplyParser.REPLY_MAIL
  244. if re.match(u'转发|Fw', kw[0], re.IGNORECASE):
  245. return MailReplyParser.RELAY_MAIL
  246. else:
  247. return MailReplyParser.REPLY_MAIL
  248. @staticmethod
  249. def is_reply(mail):
  250. """
  251. check whether mail is a replied mail
  252. :param mail:
  253. :return: True or False
  254. """
  255. # check subject
  256. if MailReplyParser.SUBJECT_REGEX.match(mail['subj']):
  257. return True
  258. # check body
  259. header_match = MailReplyParser.Multi_HEADER_REGEX.findall(mail['body'])
  260. if len(set(header_match)) >= 4: # 4 means (from, to, time, subject)
  261. return True
  262. return False
  263. def batch_reply_parser(mail_datas, n_jobs=MailGlobal.etc_map['n_jobs']):
  264. # n_jobs = (len(mail_datas) / 5000 + 1)
  265. # n_jobs = MailGlobal.etc_map['n_jobs'] if n_jobs > MailGlobal.etc_map['n_jobs'] else n_jobs
  266. mail_map = dict()
  267. mail_links = []
  268. reply_count = np.array([0] * 6)
  269. extract_info = {"has_time": 0.0}
  270. verbose = 1 if MailGlobal.DEBUG_MODEL else 0
  271. result = Parallel(n_jobs=n_jobs, verbose=verbose)(
  272. delayed(single_reply_parse)(index, mail) for index, mail in enumerate(mail_datas))
  273. for _, item in result:
  274. if 'map' in item:
  275. for k, v in item['map'].items():
  276. mail_map.setdefault(k, [])
  277. mail_map[k].extend(v)
  278. if 'link' in item:
  279. mail_links.extend(item['link'])
  280. if 'count' in item:
  281. reply_count += item['count']
  282. if "proto" not in item and "time" in item:
  283. extract_info['has_time'] += 1
  284. MailGlobal.logger.info("extract reply msg mail: %d, including reply number: %d" % tuple(reply_count[0:2]))
  285. MailGlobal.logger.info("unable to extract reply mail: %d, including lacking info mail=%d, "
  286. "no header mail=%d, and (messy code) mail=%d" % tuple(reply_count[2:6]))
  287. MailGlobal.logger.info("extract reply msg mail: time field freq=%d, ratio=%.6f"
  288. % (reply_count[0], extract_info['has_time'] / (reply_count[0] + 0.1)))
  289. # save datas
  290. if MailGlobal.DEBUG_MODEL:
  291. mail_map_count = 0
  292. for _, v in mail_map.items():
  293. mail_map_count += len(v)
  294. MailGlobal.logger.debug("mail map number: %d" % mail_map_count)
  295. MailGlobal.logger.debug("mail link number: %d" % len(mail_links))
  296. mail_reply_datas = []
  297. for md5, mails in mail_map.items():
  298. for mail in mails:
  299. mail_reply_datas.append(mail)
  300. return mail_reply_datas, mail_links
  301. def single_reply_parse(index, json_mail, time_it=10):
  302. def write_log(_mail, _queue):
  303. has_msg = False
  304. for msg in _queue:
  305. if len(msg) > 0:
  306. has_msg = True
  307. break
  308. if has_msg:
  309. MailGlobal.logger._writer("#"*20 + _mail["_id"] + " BEGIN " + "#"*20)
  310. step = 0
  311. if _queue[0] != "": # original error
  312. MailGlobal.logger._writer("# original")
  313. while step < len(_queue) and _queue[step] != "": # log original error
  314. MailGlobal.logger._writer(_queue[step])
  315. step += 1
  316. floor_no = 0
  317. floor_zone = False
  318. while step < len(_queue):
  319. msg = _queue[step]
  320. if msg == "":
  321. floor_zone = False
  322. else:
  323. if not floor_zone:
  324. floor_no += 1
  325. floor_zone = True
  326. MailGlobal.logger._writer("# floor %d" % floor_no)
  327. MailGlobal.logger._writer(msg)
  328. step += 1
  329. MailGlobal.logger._writer("#"*20 + " END " + "#"*20)
  330. def parse_mail_time(mail):
  331. if not 'time' in mail:
  332. mail["time"] = ""
  333. if type(mail['time']) is not float:
  334. mtime = advanced_parse_time(mail['time'])
  335. if mtime is not MailGlobal.INVALID_TIME:
  336. mail['time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(mtime))
  337. else:
  338. mail['time'] = mail['time'].strip()
  339. else:
  340. mail['time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(mail['time']))
  341. def parse_messy_text(mail):
  342. mail['subj'] = mail['subj'].strip(" \r\n")
  343. mail['body'] = mail['body'].strip(" \r\n")
  344. mail['subject'] = mail.pop('subj')
  345. mail['plain'] = mail.pop('body')
  346. if 'attach' in mail:
  347. mail['attachment'] = mail.pop('attach')
  348. def add_mail_map(mail, mail_type="original"):
  349. # parse time
  350. parse_mail_time(mail)
  351. # parse messy code
  352. parse_messy_text(mail)
  353. if "_id" not in mail:
  354. mail["_id"] = hashlib.md5(json.dumps(mail)).hexdigest()
  355. raw_text = MailReplyParser.REG_FILTER_CHAR.sub('', "%s %s" % (mail['subject'], mail['plain'])).encode('utf8')
  356. mail["md5_sp"] = hashlib.md5(raw_text).hexdigest()
  357. # trim mail time
  358. if MailGlobal.time_standard.match(mail["time"]):
  359. raw_time = mail["time"][: -3]
  360. else:
  361. raw_time = mail["time"]
  362. mail["md5_ftsp"] = hashlib.md5("%s %s %s" % (mail["from"]['a'].encode("utf8"), raw_time, raw_text)).hexdigest()
  363. if "mid" not in mail:
  364. mail["mid"] = mail["md5_ftsp"]
  365. key = mail["md5_ftsp"]
  366. if key not in mail_map:
  367. mail_map[key] = [mail]
  368. else:
  369. mail_map[key].append(mail)
  370. mail['type'] = mail_type
  371. mail_map = dict()
  372. try:
  373. with time_limit(time_it):
  374. log_queue = []
  375. mail = Mail(json_mail, log_queue=log_queue, extra=True)
  376. # parse mail body
  377. mail["body"] = parse_mail_body(mail["body"])
  378. mail_links = []
  379. reply_parser = MailGlobal.reply_parser
  380. # reply count (6 elements)
  381. reply_count = np.array([0] * 6)
  382. # MailGlobal.logger.debug("subject: %s" % mail['subj'])
  383. # get mail md5 txt
  384. mail['subj'] = mail['subj'].strip()
  385. mail_body = re.sub('[\n]{2,}', '\n', mail['body'])
  386. mail['body'] = MailReplyParser.trim(mail_body)
  387. # extract mail reply
  388. if reply_parser.is_reply(mail): # reply mail
  389. mail_body_lines, body_fragments, flag = MailReplyParser.extract_reply_fragment(mail['body'])
  390. # mail['flag'] = flag
  391. # handle fragment
  392. if len(body_fragments) == 1: # not extract any fragment
  393. add_mail_map(mail.m)
  394. reply_count[2] += 1
  395. MailGlobal.logger.debug("unable to extract reply msg")
  396. if flag == MailReplyParser.EXTRACT_FLAG_LACK_INFO:
  397. reply_count[3] += 1
  398. elif flag == MailReplyParser.EXTRACT_FLAG_NO_HEADER:
  399. reply_count[4] += 1
  400. if mail['plain'].find(u'\ufffd') >= 0:
  401. reply_count[5] += 1
  402. return index, {'map': mail_map, 'count': reply_count}
  403. # parse address and filter some fragement
  404. parsed_body_fragments = MailReplyParser.parse_fragment_addr(body_fragments, mail, log_queue=log_queue)
  405. # append original
  406. parsed_body_fragments.append(body_fragments[-1])
  407. body_fragments = parsed_body_fragments
  408. # modify last body fragment
  409. mail['body'] = body_fragments[-1]['body']
  410. body_fragments[-1] = mail.m
  411. MailGlobal.logger.debug("extract reply msg number: %d" % (len(body_fragments) - 1))
  412. reply_count[0] += 1
  413. reply_count[1] += len(body_fragments) - 1
  414. for idx, frag in enumerate(body_fragments):
  415. frag['subj'] = frag['subj'].strip()
  416. frag['body'] = '\n'.join(mail_body_lines[frag['body']['st']: (frag['body']['ed'] + 1)])
  417. frag['body'] = MailReplyParser.trim(frag['body'])
  418. mail_type = "original"
  419. if idx < (len(body_fragments) - 1):
  420. mail_type = "extracted"
  421. add_mail_map(frag, mail_type=mail_type)
  422. if idx >= 1:
  423. pre_frag = body_fragments[idx - 1]
  424. link_type = MailReplyParser.category(frag)
  425. mail_links.append({'node1': frag['mid'], 'node2': pre_frag['mid'], 'type': link_type})
  426. # write to log
  427. write_log(mail, log_queue)
  428. return index, {'map': mail_map, 'link': mail_links, 'count': reply_count}
  429. else: # not reply mail
  430. MailGlobal.logger.debug("mail is not reply mail")
  431. add_mail_map(mail.m, mail_type="original")
  432. # write to log
  433. write_log(mail, log_queue)
  434. return index, {'map': mail_map}
  435. except TimeoutException:
  436. MailGlobal.logger.warn("reply parser exec time out, index=%s, text=%s" % (index, mail.m['body'].encode('utf8')))
  437. add_mail_map(mail.m)
  438. return index, {'map': mail_map}
  439. def test():
  440. mail_body = u"数据已修改确认过,详见附件~\r\n\r\n\r\n\r\n成为信息安全领域最信赖的、技术领先的、\r\n国际知名的方案、产品和服务提供商\r\n------------------------------------------------------\r\n人力资源部 冯萍\r\n网神信息技术(北京)股份有限公司 \r\n地址:北京市海淀区上地开拓路7号先锋大厦二段一层(100085)\r\n手机:18618105615\r\n固话:010-62972892\r\n传真:010-62972896(请注明收件人)\r\n客户热线:400-610-8220\r\n邮箱:fengping@legendsec.com\r\n网址:www.legendsec.com\r\n \r\n发件人: 王冠华\r\n发送时间: 2016-03-04 11:55\r\n收件人: fengping@legendsec.com; guohx@legendsec.com\r\n抄送: guomc (guomc@legendsec.com); 郭建华 (guojh@legendsec.com)\r\n主题: HC预算表格更新\r\n \r\n冯萍,宏霞好,\r\n \r\n请把安服与安管中心,供应链,品牌推广,合作发展部,财务和人力行政这几个部门的情况填入附件表格\r\n \r\n不懂的地方可以问问建华怎么填,另你那边涉及到多种用工形式,把各用工形式拆开\r\n \r\n另建华,慕初,宏霞和冯萍你们三个做完了,统一都给慕初一版,慕初统一看一下我们不要拉下网神一个人\r\n \r\n今天下班前需要反馈我\r\n \r\n有任何问题可以随时联络我\r\n \r\n说的不全的慕初和建华直接补充\r\n \r\n \r\n王冠华\r\n人力资源部\r\n \r\n奇虎360\r\n手机:15727302652\r\n电话:010-52448453\r\n邮件:wangguanhua@360.cn\r\n地址:北京市朝阳区酒仙桥路6号院(电子城•国际电子总部)2号楼B座5层\r\n \r\n"
  441. log_queue = []
  442. mail = {'from': {'a': "wangguanhua@360.cn"},
  443. "to": [{'n': u'王冠华', 'a': "wangguanhua@360.cn"}], "cc": []}
  444. mail_body_lines, body_fragments, flag = MailReplyParser.extract_reply_fragment(mail_body)
  445. body_fragments = MailReplyParser.parse_fragment_addr(body_fragments, mail, log_queue=log_queue)
  446. print '\n'.join(log_queue)
  447. print '\n'*3
  448. print "flag = ", flag
  449. for frag in body_fragments:
  450. print frag
  451. # parse address
  452. from mail_preprocess import advanced_parse_time
  453. s = u"2015年12月4日 15:13"
  454. print advanced_parse_time(s)
  455. if __name__ == "__main__":
  456. # test()
  457. # json_mails = json.load(open('reply_test.txt', 'r'))
  458. json_mails = map(lambda line: json.loads(line.strip()), open('reply_test_2.txt', 'r').readlines())
  459. MailGlobal.register_instance('reply_parser', MailGlobal.timeit_v2(lambda: MailReplyParser(),
  460. desc="load reply parser "))
  461. if len(json_mails) > 0:
  462. # mail_datas = load_json_mails(json_mails)
  463. """
  464. mail_reply_datas (type of list)
  465. mail_reply_links (type of list)
  466. """
  467. mail_reply_datas, mail_reply_links = batch_reply_parser(json_mails, n_jobs=1)
  468. print '\n'.join(map(str, mail_reply_datas[:10]))
  469. print "\n\n"
  470. print '\n'.join(map(str, mail_reply_links[:10]))