PageRenderTime 30ms CodeModel.GetById 1ms RepoModel.GetById 0ms app.codeStats 0ms

/mail_spam.py

https://gitlab.com/yaojian/RenjuAI
Python | 500 lines | 467 code | 20 blank | 13 comment | 20 complexity | 24efc4ef22f448a86abd597ea3054f58 MD5 | raw file
  1. #!/usr/bin/python
  2. # -*- coding: utf-8 -*-
  3. # author: yaojian-xy <yaojian-xy@360.cn>
  4. import os
  5. import cPickle
  6. import json
  7. import re
  8. import numpy as np
  9. from sklearn.naive_bayes import MultinomialNB
  10. from acora import AcoraBuilder
  11. from mail_global import MailGlobal
  12. from mail_io import load_json_file
  13. from mail_util import SparseMatrixHandle
  14. class SpamMailClassifer:
  15. SPAM_YES = 1
  16. SPAM_NO = 0
  17. def __init__(self):
  18. self.spam_model_file = MailGlobal.etc_map['spam_model']
  19. if not os.path.exists(self.spam_model_file):
  20. self._model = {'cn': BayesClassifer(), 'en': BayesClassifer()}
  21. else:
  22. self._model = cPickle.load(open(self.spam_model_file, 'rb'))
  23. def dump(self):
  24. cPickle.dump(self._model, open(self.spam_model_file, 'wb'))
  25. def partial_fit(self, mail_sets, lang='cn'):
  26. if mail_sets.length() == 0:
  27. return
  28. subj_feat = MailGlobal.feature.transform(mail_sets['subj_tok'], field="subj", lang=lang)
  29. body_feat = MailGlobal.feature.transform(mail_sets['body_tok'], field="body", lang=lang)
  30. mail_feat = SparseMatrixHandle.csr_happend((subj_feat, body_feat))
  31. MailGlobal.logger.info('transform %s spam labeled mails %s' % (lang, str(mail_feat.shape)))
  32. self._model[lang].partial_fit(mail_feat, mail_sets['label'])
  33. def predict(self, mail_sets, prob=False, lang='cn'):
  34. if mail_sets.length() == 0:
  35. return []
  36. subj_feat = MailGlobal.feature.transform(mail_sets['subj_tok'], field="subj", lang=lang)
  37. body_feat = MailGlobal.feature.transform(mail_sets['body_tok'], field="body", lang=lang)
  38. mail_feat = SparseMatrixHandle.csr_happend((subj_feat, body_feat))
  39. prob_y = self._model[lang].predict(mail_feat, prob=prob)
  40. return prob_y
  41. class BayesClassifer:
  42. def __init__(self):
  43. self._model = MultinomialNB()
  44. self._classes = [SpamMailClassifer.SPAM_YES, SpamMailClassifer.SPAM_NO]
  45. def partial_fit(self, features, labels):
  46. self._model.partial_fit(features, labels, classes=self._classes)
  47. def predict(self, mail_feat, prob=False):
  48. if prob:
  49. prob_y = self._model.predict_proba(mail_feat)
  50. idx = np.argmax(prob_y, axis=1)
  51. return [(1 - prob_y[i][0]) if idx[i] == 0 else prob_y[i][1] for i in xrange(len(idx))]
  52. else:
  53. return self._model.predict(mail_feat)
  54. def cv(mail_sets, k_fold=5, lang='cn'):
  55. from sklearn.cross_validation import StratifiedKFold
  56. from sklearn.metrics import roc_auc_score
  57. if mail_sets.length() < k_fold:
  58. return
  59. mail_labels = mail_sets.df['label'].values
  60. # class sample count
  61. uniq_val, uniq_cnt = np.unique(mail_labels, return_counts=True)
  62. MailGlobal.logger.info("sample classes: %s, class count: %s" % (uniq_val, uniq_cnt,))
  63. # transform datas
  64. mail_subj_feat = MailGlobal.feature.transform(mail_sets.df['subj_tok'], field="subj", lang=lang)
  65. mail_body_feat = MailGlobal.feature.transform(mail_sets.df['body_tok'], field="body", lang=lang)
  66. mail_feat = SparseMatrixHandle.csr_happend((mail_subj_feat, mail_body_feat)).tocsr()
  67. # cross validation
  68. skf = StratifiedKFold(mail_labels, n_folds=k_fold)
  69. for i, (train_idx, test_idx) in enumerate(skf):
  70. train_X, train_y = mail_feat[train_idx], mail_labels[train_idx]
  71. test_X, test_y = mail_feat[test_idx], mail_labels[test_idx]
  72. MailGlobal.logger.info("cv_%d, train shape: %s, test shape: %s" % (i, train_X.shape, test_X.shape,))
  73. cv_model = BayesClassifer()
  74. cv_model.partial_fit(train_X, train_y)
  75. prob_y = cv_model.predict(test_X, prob=True)
  76. score = roc_auc_score(test_y, prob_y)
  77. MailGlobal.logger.info("auc score: %.8f" % score)
  78. class RuleBasedModel:
  79. def __init__(self, rule_file=None, alpha=0.5):
  80. if rule_file is None:
  81. rule_file = MailGlobal.etc_map['spam_rule_file']
  82. self.rule_file = rule_file
  83. self.alpha = alpha
  84. rule_data = load_json_file(rule_file)
  85. if rule_data is not None:
  86. self.regex_rules = RuleBasedModel.regex_rule_data(rule_data)
  87. else: # default value
  88. self.regex_rules = {'subj_kws': dict(), 'body_kws': dict(), 'subj_reg': None, 'body_reg': None}
  89. @staticmethod
  90. def regex_rule_data(rules_dict):
  91. """
  92. parse rule data to regex object
  93. :return:
  94. """
  95. # regex for subject and body
  96. regex_rules = dict()
  97. regex_rules['subj_kws'] = dict((obj['key'], obj['score']) for obj in rules_dict['subject_rule_kw'])
  98. regex_rules['body_kws'] = dict((obj['key'], obj['score']) for obj in rules_dict['body_rule_kw'])
  99. subj_builder = AcoraBuilder()
  100. body_builder = AcoraBuilder()
  101. map(lambda x: subj_builder.add(x), regex_rules['subj_kws'].keys())
  102. map(lambda x: body_builder.add(x), regex_rules['body_kws'].keys())
  103. regex_rules['subj_reg'] = subj_builder.build()
  104. regex_rules['body_reg'] = body_builder.build()
  105. # regex for white list and black list
  106. return regex_rules
  107. def score_mail_by_rule(self, mail):
  108. score_subj = 0.0
  109. if self.regex_rules['subj_reg'] is not None:
  110. for item in self.regex_rules['subj_reg'].findall(mail['subj']):
  111. key, count = item
  112. score_subj += self.regex_rules['subj_kws'][key] * count
  113. score_body = 0.0
  114. if self.regex_rules['body_reg'] is not None:
  115. for item in self.regex_rules['body_reg'].findall(mail['body']):
  116. key, count = item
  117. score_body += self.regex_rules['body_kws'][key] * count
  118. score = self.alpha * score_subj + (1 - self.alpha) * score_body
  119. return score
  120. def predict(self, mail, hits_score=None):
  121. """
  122. @:param hits_score: recognize as spam email when score larger than "hits_score"
  123. """
  124. score = self.score_mail_by_rule(mail)
  125. if hits_score is not None:
  126. return SpamMailClassifer.SPAM_YES if score > hits_score else SpamMailClassifer.SPAM_NO
  127. else:
  128. return score
# chinese rule_based model (module-level singleton, shared by BlendClassifer)
cn_rule_model = RuleBasedModel(rule_file=MailGlobal.etc_map['spam_rule_file'])
  131. class BlendClassifer:
  132. """
  133. multiple factors considered, such as:
  134. rule-based : text keywords, url rule
  135. graph-based: email response style, like only send but not receive
  136. ml-based : corpus-based supervised machine learning
  137. blacklist : black list (dns or just email address)
  138. """
  139. def __init__(self, rule_file=MailGlobal.etc_map['spam_rule_file']):
  140. self.ml_clf = BayesClassifer()
  141. self.rules = cn_rule_model
  142. def fit(self, mail_feat, labels):
  143. self.ml_clf.fit(mail_feat, labels)
  144. def predict(self, mail, prob=False, smooth=0.00001):
  145. sender = mail['from']
  146. # recognize from blacklist
  147. sender_dns = sender[(sender.find('@') + 1):]
  148. if sender_dns in self.blacklist['dns'] or sender in self.blacklist['addr']:
  149. return SpamMailClassifer.SPAM_YES
  150. # recognize from graph net
  151. num_to_from, num_cc_from, num_in_attach = MailGlobal.graphnet.in_degree(sender)
  152. num_from_to, num_from_cc, num_out_attach = MailGlobal.graphnet.out_degree(sender)
  153. ratio_from = num_from_cc / (num_cc_from + smooth)
  154. ratio_attach = num_out_attach / (num_in_attach + smooth)
  155. # recognize from un-token text rules
  156. rule_score_subj = 0.0
  157. for key in self.rules['subj_reg'].findall(mail['subj']):
  158. rule_score_subj += self.rules['subj'][key]
  159. rule_score_body = 0.0
  160. for key in self.rules['body_reg'].findall(mail['body']):
  161. rule_score_body += self.rules['body'][key]
  162. # recognize from
  163. ml_score = self.ml_clf.predict(mail)
  164. # combine
  165. # TODO
  166. return 0
  167. def select_mails(mail_files, output_prefix):
  168. """
  169. :param mail_file:
  170. :param output_prefix:
  171. :return:
  172. """
  173. import hashlib
  174. from mail_util import max_min_scale
  175. from mail_io import load_mail_file
  176. from mail_preprocess import check_language
  177. lang_cn, lang_en = MailGlobal.MAIL_LANG_CN, MailGlobal.MAIL_LANG_EN
  178. def load_mail_files(m_files): # load mail files
  179. mails = []
  180. for lc_i, mail_file in enumerate(m_files):
  181. if os.path.exists(mail_file):
  182. mail_data = load_mail_file(mail_file)
  183. print "read mail json file:", mail_file, ", mail number = ", len(mail_data)
  184. mails.extend(mail_data)
  185. print "read total mail number = ", len(mails)
  186. return mails
  187. def remove_dumplicate(mails): # remove duplicate mail according "subject" and "body"
  188. mails_md5 = np.empty(len(mails), dtype='|S32')
  189. for lc_i, lc_mail in enumerate(mails):
  190. if len(lc_mail['body']) < 100:
  191. mails_md5[lc_i] = hashlib.md5(("%s %s" % (lc_mail['subj'], lc_mail['body'])).encode('utf8')).hexdigest()
  192. else:
  193. mails_md5[lc_i] = hashlib.md5(lc_mail['body'].encode('utf8')).hexdigest()
  194. mail_datas[lc_i]['md5'] = mails_md5[lc_i]
  195. _, unique_idx = np.unique(mails_md5, return_index=True)
  196. unique_mail_datas = [mail_datas[idx] for idx in unique_idx]
  197. return unique_mail_datas
  198. def remove_dumplicate_v2(mails): # remove duplicate by simhash
  199. from module.simhash import SimhashBucket
  200. dup_bucket = SimhashBucket()
  201. unique_mail_datas = []
  202. for lc_i, lc_mail in enumerate(mails):
  203. mails[lc_i]['md5'] = hashlib.md5(("%s %s" % (lc_mail['subj'], lc_mail['body'])).encode('utf8')).hexdigest()
  204. if not dup_bucket.has_duplicate(lc_mail):
  205. dup_bucket.add(lc_mail)
  206. unique_mail_datas.append(lc_mail)
  207. if lc_i % 100 == 0:
  208. print lc_i,
  209. print
  210. return unique_mail_datas
  211. def filter_by_cluster(mails):
  212. from sklearn.cluster import KMeans
  213. from sklearn.feature_extraction.text import TfidfVectorizer
  214. from mail_preprocess import parallel_preprocess
  215. from scipy.sparse import csr_matrix
  216. print "parallel preprocess"
  217. parallel_preprocess(mails)
  218. mail_text = ["%s %s" % (mail['subj_tok'], mail['body_tok']) for mail in mails]
  219. print "tf idf vectorizer"
  220. mail_feat = TfidfVectorizer().fit_transform(mail_text).tocsr()
  221. print "kmeans cluster"
  222. fit_num = min(100000, mail_feat.shape[0])
  223. fit_indics = np.random.choice(mail_feat.shape[0], fit_num, replace=False)
  224. fit_feat = mail_feat[fit_indics, :]
  225. model = KMeans(n_clusters=100, max_iter=200, verbose=1, n_jobs=MailGlobal.etc_map['n_jobs'])
  226. model.fit(fit_feat)
  227. pred_labels = model.predict(mail_feat)
  228. uniq_labels = np.unique(pred_labels)
  229. sample_mails = []
  230. for label in uniq_labels:
  231. label_idx = np.where(pred_labels == label)[0]
  232. # choice_num = min(100, len(label_idx))
  233. choice_num = int(len(label_idx) * 0.1)
  234. select_idx = np.random.choice(label_idx, choice_num, replace=False)
  235. map(lambda x: sample_mails.append(mails[x]), select_idx)
  236. return sample_mails
  237. def filter_by_NMF(mails):
  238. from sklearn.decomposition import NMF
  239. from sklearn.feature_extraction.text import TfidfVectorizer
  240. from mail_preprocess import parallel_preprocess
  241. print "parallel preprocess"
  242. parallel_preprocess(mails)
  243. mail_text = ["%s %s" % (mail['subj_tok'], mail['body_tok']) for mail in mails]
  244. print "tf idf vectorizer"
  245. mail_feat = TfidfVectorizer().fit_transform(mail_text)
  246. print "nmf model"
  247. mail_feat = NMF(n_components=1000, init='random', max_iter=200, random_state=0).fit_transform(mail_feat)
  248. pred_labels = np.argmax(mail_feat, axis=1)
  249. uniq_labels = np.unique(pred_labels)
  250. sample_mails = []
  251. for label in uniq_labels:
  252. label_idx = np.where(pred_labels == label)[0]
  253. choice_num = int(len(label_idx) * 0.1)
  254. select_idx = np.random.choice(label_idx, choice_num, replace=False)
  255. map(lambda x: sample_mails.append(mails[x]), select_idx)
  256. return sample_mails
  257. def find_spam_address(mails):
  258. # find spam email according by alarm of "mailadmin@alarm.360.cn"
  259. lc_spam_set = set()
  260. email_reg = re.compile('[\w\.-]+@[\w\.-]+')
  261. spam_words = [u'88全讯网']
  262. for lc_mail in mails:
  263. if type(lc_mail['from']) != dict:
  264. continue
  265. if lc_mail['from']['a'] == "mailadmin@alarm.360.cn":
  266. # extract email format data from mail body
  267. spams = email_reg.findall(lc_mail['body'])
  268. # print "\n".join(spams)
  269. map(lambda x: lc_spam_set.add(x), spams)
  270. else:
  271. for word in spam_words:
  272. if lc_mail['subj'].find(word) >= 0 or lc_mail['body'].find(word) >= 0:
  273. lc_spam_set.add(lc_mail['from']['a'])
  274. # filter white dns address from spam list
  275. spam_set2 = set()
  276. for addr in lc_spam_set:
  277. if addr.endswith("360.cn") or addr.endswith("qihoo.net"):
  278. continue
  279. spam_set2.add(addr)
  280. with open("%s/spam_addr_list.txt" % MailGlobal.data_dir, 'wb') as spam_writer:
  281. spam_writer.write("\n".join(spam_set2))
  282. print "finish write spam file"
  283. lc_spam_set = spam_set2
  284. return lc_spam_set
  285. def filter_ignore_mails(mails): # ignore mails in ignore list
  286. ignore_list = set([u'mailadmin@alarm.360.cn'])
  287. remain_mail_datas = []
  288. for local_mail in mails:
  289. if type(local_mail['from']) != dict or local_mail['from']['a'] not in ignore_list:
  290. remain_mail_datas.append(local_mail)
  291. return remain_mail_datas
  292. def filter_ignore_mails_v1(mails): # ignore empty mails
  293. remain_mail_datas = []
  294. for local_mail in mails:
  295. if local_mail['body'].strip() == "":
  296. remain_mail_datas.append(local_mail)
  297. return remain_mail_datas
  298. mail_datas = load_mail_files(mail_files)
  299. mail_datas = remove_dumplicate(mail_datas)
  300. print "after remove_dumplicate, left mails", len(mail_datas)
  301. # mail_datas = remove_dumplicate_v2(mail_datas)
  302. # print "after remove_dumplicate_v2, left mails", len(mail_datas)
  303. spam_set = find_spam_address(mail_datas)
  304. mail_datas = filter_ignore_mails(mail_datas)
  305. print "after filter_ignore_mails, left mails", len(mail_datas)
  306. # choose samples by cluster
  307. # mail_datas = filter_by_cluster(mail_datas)
  308. # mail_datas = filter_by_NMF(mail_datas)
  309. # print "after filter_by_cluster, left mails", len(mail_datas)
  310. # print "="*20, "spam address list", "="*20
  311. # print "\n".join(spam_set2)
  312. # collection mails
  313. def score_mail_by_rule(mails):
  314. batch_mails = {lang_cn: [], lang_en: []}
  315. scores_cn = []
  316. spam_num, not_spam_num = 0, 0
  317. for mail in mails:
  318. lang = check_language(mail)
  319. if mail['from']['a'] in spam_set: # when address in black list
  320. mail['label'] = 1
  321. elif MailGlobal.reply_parser.is_reply(mail): # if is reply email
  322. mail['label'] = 0
  323. elif lang == lang_cn:
  324. score = cn_rule_model.score_mail_by_rule(mail)
  325. scores_cn.append(score)
  326. if lang == lang_cn and 'label' in mail.m:
  327. if mail['label'] == 1:
  328. spam_num += 1
  329. else:
  330. not_spam_num += 1
  331. if lang == lang_cn or (lang == lang_en and 'label' in mail.m):
  332. batch_mails[lang].append(mail)
  333. # max min scaler
  334. scores_cn = max_min_scale(scores_cn)
  335. step_j = 0
  336. for i in xrange(len(batch_mails[lang_cn])):
  337. if 'label' not in batch_mails[lang_cn][i].m:
  338. batch_mails[lang_cn][i]['sc'] = scores_cn[step_j]
  339. step_j += 1
  340. print "cn email=", len(batch_mails[lang_cn]), ", spam_email=", spam_num, ", not_spam_email=", not_spam_num
  341. print "en email=", len(batch_mails[lang_en])
  342. return batch_mails
  343. def score_mail_by_clf(mails):
  344. from mail_entity import MailSet
  345. from mail_preprocess import parallel_preprocess
  346. tok_mails = parallel_preprocess(mails)
  347. batch_mails = {lang_cn: [], lang_en: []}
  348. for mail in tok_mails:
  349. lang = check_language(mail)
  350. batch_mails[lang].append(mail)
  351. mail_sets_cn = MailSet(batch_mails[lang_cn])
  352. # mail_sets_en = MailSet(batch_mails[lang_en])
  353. prob_y_cn = MailGlobal.spam_clf.predict(mail_sets_cn, prob=True, lang=lang_cn)
  354. for idx in xrange(len(prob_y_cn)):
  355. batch_mails[lang_cn][idx]['sc'] = prob_y_cn[idx]
  356. batch_mails[lang_cn].sort(key=lambda x: x['sc'], cmp=lambda x, y: cmp(float(x), float(y)), reverse=True)
  357. batch_mails[lang_cn] = batch_mails[lang_cn][: int(len(batch_mails[lang_cn]) * 0.5)]
  358. return batch_mails
  359. # batch_mails = score_mail_by_rule(mail_datas)
  360. batch_mails = score_mail_by_clf(mail_datas)
  361. print "after clf, remain mails=", len(batch_mails[lang_cn])
  362. batch_mails[lang_cn] = filter_by_cluster(batch_mails[lang_cn])
  363. with open("%s_cn.json" % output_prefix, 'wb') as m_writer:
  364. for mail in batch_mails[lang_cn]:
  365. json_mail = mail.m
  366. m_writer.write("%s\n" % json.dumps(json_mail))
  367. with open("%s_en.json" % output_prefix, 'wb') as m_writer:
  368. for mail in batch_mails[lang_en]:
  369. m_writer.write("%s\n" % json.dumps(mail.m))
  370. def label_mails(mail_file, output_map, output_data, lang):
  371. """
  372. :param mail_file: mail file
  373. :param output_map: file name to store map {mail_md5, mail_labels}
  374. :param output_data: file name to store labeled json mails.
  375. :param max_labeled_mail:
  376. :param lang: mail language
  377. :return:
  378. """
  379. from mail_io import load_mail_file
  380. from mail_preprocess import mail_print
  381. mail_datas = load_mail_file(mail_file, raw_type='dict')
  382. if os.path.exists(output_map):
  383. mail_labels_line = open(output_map, 'rb').readlines()
  384. mail_labels = {}
  385. for unit in mail_labels_line:
  386. unit = unit.split(',')
  387. mail_labels[unit[0].strip()] = int(unit[1]) # (mail_md5, label)
  388. else:
  389. mail_labels = {}
  390. with open(output_map, 'a+') as mail_label_writer:
  391. labeled_data_writer = open(output_data, 'a+')
  392. # order "sc"
  393. score_ordered = {'sc': [], 'idx': []}
  394. for i, mail in enumerate(mail_datas):
  395. if 'sc' in mail.m:
  396. score_ordered['sc'].append(float(mail['sc']))
  397. score_ordered['idx'].append(i)
  398. if 'label' in mail.m and mail['md5'] not in mail_labels: # write labeled mail
  399. label = int(mail['label'])
  400. mail_labels[mail['md5']] = label
  401. mail_label_writer.write("%s, %s\n" % (mail['md5'], label))
  402. json_mail = mail.m
  403. json_mail['label'] = int(label)
  404. json_mail['lang'] = lang
  405. labeled_data_writer.write("%s\n" % json.dumps(json_mail))
  406. sort_idx = np.argsort(score_ordered['sc'])[::-1]
  407. score_ordered['idx'] = np.array(score_ordered['idx'])
  408. score_ordered['idx'] = score_ordered['idx'][sort_idx]
  409. for idx in score_ordered['idx']:
  410. mail = mail_datas[idx]
  411. if mail['md5'] in mail_labels:
  412. continue
  413. else: # label
  414. print "=" * 20, "labeled mail number: ", len(mail_labels), "/", len(score_ordered['idx']), "=" * 20
  415. mail_print(mail)
  416. while True:
  417. if 'sc' in mail.m:
  418. label = raw_input("mail score %.4f, is spam email ? [0, 1]: " % mail['sc'])
  419. else:
  420. label = raw_input("is spam email ? [0, 1]: ")
  421. if label == '0' or label == '1':
  422. break
  423. mail_labels[mail['md5']] = int(label)
  424. mail_label_writer.write("%s, %s\n" % (mail['md5'], label))
  425. json_mail = mail.m
  426. json_mail['label'] = int(label)
  427. json_mail['lang'] = lang
  428. labeled_data_writer.write("%s\n" % json.dumps(json_mail))
  429. print "finish label spam email"
def analysis():
    # Experiment scaffold for training/evaluating the spam classifier.
    # Only the empty accumulators are live code; the train/test pipeline
    # below is kept commented out for reference.
    sample_raw_text, sample_label = [], []
    # for mail in mail_datas:
    # if mail_labels.has_key(mail._md5):
    # sample_raw_text.append(mail._body)
    # sample_label.append(mail_labels[mail._md5])
    # sample_raw_text = np.array(sample_raw_text)
    # sample_label = np.array(sample_label)
    # shuf_indics = range(0, len(sample_raw_text))
    # random.shuffle(shuf_indics)
    # test_size = 0.1
    # split_pos = int(len(sample_raw_text) * test_size)
    # test_raw_text, test_label = sample_raw_text[:split_pos], sample_label[:split_pos]
    # test_feat = MailGlobal.feature.transform_body(test_raw_text)
    # print "test shape:", test_feat.shape
    # if not os.path.exists(MailGlobal.spam_clf._spam_model_file):
    # train_raw_text, train_label = sample_raw_text[split_pos:], sample_label[split_pos:]
    # train_feat = MailGlobal.feature.transform_body(train_raw_text)
    # print "train shape:", train_feat.shape
    # MailGlobal.spam_clf.fit(train_feat, train_label)
    # MailGlobal.spam_clf.save()
    # prob_y = MailGlobal.spam_clf.predict(test_feat, prob = True)
    # from sklearn.metrics import roc_auc_score
    # print "auc score: ", roc_auc_score(test_label, prob_y)