# mail_spam.py
- #!/usr/bin/python
- # -*- coding: utf-8 -*-
- # author: yaojian-xy <yaojian-xy@360.cn>
- import os
- import cPickle
- import json
- import re
- import numpy as np
- from sklearn.naive_bayes import MultinomialNB
- from acora import AcoraBuilder
- from mail_global import MailGlobal
- from mail_io import load_json_file
- from mail_util import SparseMatrixHandle
class SpamMailClassifer:
    """Spam classifier facade: one naive-Bayes model per language ('cn'/'en'),
    persisted with cPickle at MailGlobal.etc_map['spam_model']."""
    SPAM_YES = 1
    SPAM_NO = 0

    def __init__(self):
        self.spam_model_file = MailGlobal.etc_map['spam_model']
        if not os.path.exists(self.spam_model_file):
            # no snapshot yet: start with fresh, untrained models
            self._model = {'cn': BayesClassifer(), 'en': BayesClassifer()}
        else:
            # NOTE: unpickling an attacker-controlled file can execute arbitrary
            # code -- the model file must remain trusted.
            # `with` closes the handle (the original leaked it).
            with open(self.spam_model_file, 'rb') as model_reader:
                self._model = cPickle.load(model_reader)

    def dump(self):
        """Persist both language models to self.spam_model_file."""
        with open(self.spam_model_file, 'wb') as model_writer:  # close handle (was leaked)
            cPickle.dump(self._model, model_writer)

    def partial_fit(self, mail_sets, lang='cn'):
        """Incrementally train the `lang` model on a labeled mail set.

        :param mail_sets: mail set exposing length(), 'subj_tok', 'body_tok', 'label'
        :param lang: language key, 'cn' or 'en'
        """
        if mail_sets.length() == 0:
            return
        # subject and body features are stacked horizontally into one matrix
        subj_feat = MailGlobal.feature.transform(mail_sets['subj_tok'], field="subj", lang=lang)
        body_feat = MailGlobal.feature.transform(mail_sets['body_tok'], field="body", lang=lang)
        mail_feat = SparseMatrixHandle.csr_happend((subj_feat, body_feat))
        MailGlobal.logger.info('transform %s spam labeled mails %s' % (lang, str(mail_feat.shape)))
        self._model[lang].partial_fit(mail_feat, mail_sets['label'])

    def predict(self, mail_sets, prob=False, lang='cn'):
        """Predict labels (or spam probabilities when prob=True) for a mail set.

        :return: list/array of labels or probabilities; [] for an empty set
        """
        if mail_sets.length() == 0:
            return []
        subj_feat = MailGlobal.feature.transform(mail_sets['subj_tok'], field="subj", lang=lang)
        body_feat = MailGlobal.feature.transform(mail_sets['body_tok'], field="body", lang=lang)
        mail_feat = SparseMatrixHandle.csr_happend((subj_feat, body_feat))
        prob_y = self._model[lang].predict(mail_feat, prob=prob)
        return prob_y
class BayesClassifer:
    """Thin wrapper over sklearn MultinomialNB for binary spam classification."""

    def __init__(self):
        self._model = MultinomialNB()
        # fixed class universe so partial_fit can be fed batch by batch
        self._classes = [SpamMailClassifer.SPAM_YES, SpamMailClassifer.SPAM_NO]

    def partial_fit(self, features, labels):
        """Incrementally fit on one batch of feature rows and labels."""
        self._model.partial_fit(features, labels, classes=self._classes)

    def predict(self, mail_feat, prob=False):
        """Return predicted labels, or P(spam) per row when prob=True.

        predict_proba columns follow the sorted classes_ ([0, 1]), so column 1
        is P(SPAM_YES).  The original branched on argmax and returned either
        1 - p[0] or p[1]; both equal P(spam) since each row sums to 1, so the
        branch was redundant (and used py2-only xrange).
        """
        if not prob:
            return self._model.predict(mail_feat)
        prob_y = self._model.predict_proba(mail_feat)
        return [row[1] for row in prob_y]
def cv(mail_sets, k_fold=5, lang='cn'):
    """Stratified k-fold cross validation of the Bayes spam model; logs the
    class distribution, per-fold shapes and the ROC-AUC of each fold."""
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.metrics import roc_auc_score
    # not enough samples to build k folds
    if mail_sets.length() < k_fold:
        return
    labels = mail_sets.df['label'].values
    # log how the samples are distributed over the classes
    classes, counts = np.unique(labels, return_counts=True)
    MailGlobal.logger.info("sample classes: %s, class count: %s" % (classes, counts,))
    # build the feature matrix: subject features stacked next to body features
    subj_feat = MailGlobal.feature.transform(mail_sets.df['subj_tok'], field="subj", lang=lang)
    body_feat = MailGlobal.feature.transform(mail_sets.df['body_tok'], field="body", lang=lang)
    features = SparseMatrixHandle.csr_happend((subj_feat, body_feat)).tocsr()
    # train/evaluate one fold at a time on a fresh model
    folds = StratifiedKFold(labels, n_folds=k_fold)
    for fold_no, (fit_idx, eval_idx) in enumerate(folds):
        fit_X, fit_y = features[fit_idx], labels[fit_idx]
        eval_X, eval_y = features[eval_idx], labels[eval_idx]
        MailGlobal.logger.info("cv_%d, train shape: %s, test shape: %s" % (fold_no, fit_X.shape, eval_X.shape,))
        fold_model = BayesClassifer()
        fold_model.partial_fit(fit_X, fit_y)
        fold_prob = fold_model.predict(eval_X, prob=True)
        fold_auc = roc_auc_score(eval_y, fold_prob)
        MailGlobal.logger.info("auc score: %.8f" % fold_auc)
class RuleBasedModel:
    """Keyword-rule spam scorer: matches weighted keywords in subject and body
    with an Acora multi-pattern matcher and blends the two scores by alpha."""

    def __init__(self, rule_file=None, alpha=0.5):
        """
        :param rule_file: json rule file; defaults to etc_map['spam_rule_file']
        :param alpha: weight of the subject score (body gets 1 - alpha)
        """
        if rule_file is None:
            rule_file = MailGlobal.etc_map['spam_rule_file']
        self.rule_file = rule_file
        self.alpha = alpha
        rule_data = load_json_file(rule_file)
        if rule_data is not None:
            self.regex_rules = RuleBasedModel.regex_rule_data(rule_data)
        else:  # default value: no keywords, no matchers
            self.regex_rules = {'subj_kws': dict(), 'body_kws': dict(), 'subj_reg': None, 'body_reg': None}

    @staticmethod
    def regex_rule_data(rules_dict):
        """
        parse rule data to regex object
        :return: dict with keyword->score maps and the built Acora matchers
        """
        # keyword -> score maps for subject and body
        regex_rules = dict()
        regex_rules['subj_kws'] = dict((obj['key'], obj['score']) for obj in rules_dict['subject_rule_kw'])
        regex_rules['body_kws'] = dict((obj['key'], obj['score']) for obj in rules_dict['body_rule_kw'])
        subj_builder = AcoraBuilder()
        body_builder = AcoraBuilder()
        # explicit loops instead of map(): map() is lazy on Python 3, so the
        # add() side effects would silently never run
        for keyword in regex_rules['subj_kws'].keys():
            subj_builder.add(keyword)
        for keyword in regex_rules['body_kws'].keys():
            body_builder.add(keyword)
        regex_rules['subj_reg'] = subj_builder.build()
        regex_rules['body_reg'] = body_builder.build()
        # regex for white list and black list
        return regex_rules

    def score_mail_by_rule(self, mail):
        """Return alpha * subject_score + (1 - alpha) * body_score for one mail.

        Acora findall yields (keyword, position) pairs -- the second item is a
        match *offset*, not a count -- so each hit adds its keyword score once.
        (The original unpacked it as a count and multiplied the score by the
        match position, inflating scores arbitrarily.)
        """
        score_subj = 0.0
        if self.regex_rules['subj_reg'] is not None:
            for key, _pos in self.regex_rules['subj_reg'].findall(mail['subj']):
                score_subj += self.regex_rules['subj_kws'][key]
        score_body = 0.0
        if self.regex_rules['body_reg'] is not None:
            for key, _pos in self.regex_rules['body_reg'].findall(mail['body']):
                score_body += self.regex_rules['body_kws'][key]
        score = self.alpha * score_subj + (1 - self.alpha) * score_body
        return score

    def predict(self, mail, hits_score=None):
        """
        @:param hits_score: recognize as spam email when score larger than "hits_score"
        :return: a label when hits_score is given, otherwise the raw rule score
        """
        score = self.score_mail_by_rule(mail)
        if hits_score is not None:
            return SpamMailClassifer.SPAM_YES if score > hits_score else SpamMailClassifer.SPAM_NO
        else:
            return score
# chinese rule_based model
# module-level singleton built at import time from the configured rule file;
# NOTE(review): importing this module therefore reads spam_rule_file as a side effect
cn_rule_model = RuleBasedModel(rule_file=MailGlobal.etc_map['spam_rule_file'])
class BlendClassifer:
    """
    multiple factors considered, such as:
        rule-based : text keywords, url rule
        graph-based: email response style, like only send but not receive
        ml-based   : corpus-based supervised machine learning
        blacklist  : black list (dns or just email address)
    Work in progress: only the blacklist short-circuit is effective; the
    remaining signals are computed but not yet combined (see TODO below).
    """

    def __init__(self, rule_file=MailGlobal.etc_map['spam_rule_file']):
        # NOTE(review): the default rule_file is evaluated once at
        # class-definition time, not per call
        self.ml_clf = BayesClassifer()
        self.rules = cn_rule_model
        # empty blacklist by default; predict() used to raise AttributeError
        # because this attribute was never initialized anywhere
        self.blacklist = {'dns': set(), 'addr': set()}

    def fit(self, mail_feat, labels):
        """Train the ML component.  BayesClassifer only exposes partial_fit;
        the original called a non-existent fit() and raised AttributeError."""
        self.ml_clf.partial_fit(mail_feat, labels)

    def predict(self, mail, prob=False, smooth=0.00001):
        """Blend blacklist / graph / rule / ML signals for a single mail dict.

        :param mail: mail dict with at least 'from', 'subj', 'body'
        :param smooth: additive smoothing for the degree ratios
        """
        sender = mail['from']
        # recognize from blacklist
        sender_dns = sender[(sender.find('@') + 1):]
        if sender_dns in self.blacklist['dns'] or sender in self.blacklist['addr']:
            return SpamMailClassifer.SPAM_YES
        # recognize from graph net
        num_to_from, num_cc_from, num_in_attach = MailGlobal.graphnet.in_degree(sender)
        num_from_to, num_from_cc, num_out_attach = MailGlobal.graphnet.out_degree(sender)
        ratio_from = num_from_cc / (num_cc_from + smooth)
        ratio_attach = num_out_attach / (num_in_attach + smooth)
        # recognize from un-token text rules.  RuleBasedModel keeps its matchers
        # and score maps in .regex_rules (the original indexed the model object
        # itself with keys that do not exist); Acora findall yields
        # (keyword, position) pairs, so unpack and score by keyword.
        rule_score_subj = 0.0
        for key, _pos in self.rules.regex_rules['subj_reg'].findall(mail['subj']):
            rule_score_subj += self.rules.regex_rules['subj_kws'][key]
        rule_score_body = 0.0
        for key, _pos in self.rules.regex_rules['body_reg'].findall(mail['body']):
            rule_score_body += self.rules.regex_rules['body_kws'][key]
        # recognize from the trained classifier
        # TODO(review): BayesClassifer.predict expects a feature matrix, not a
        # raw mail dict -- the mail must be vectorized first
        ml_score = self.ml_clf.predict(mail)
        # combine
        # TODO combine the factors above into a final score
        return 0
def select_mails(mail_files, output_prefix):
    """
    Select candidate mails for manual spam labeling: load raw mail json files,
    de-duplicate, harvest known spam addresses, score mails and dump the
    selected cn/en subsets to "<output_prefix>_cn.json" / "_en.json".
    :param mail_files: list of mail json file paths
    :param output_prefix: prefix of the output json files
    :return:
    """
    import hashlib
    from mail_util import max_min_scale
    from mail_io import load_mail_file
    from mail_preprocess import check_language
    lang_cn, lang_en = MailGlobal.MAIL_LANG_CN, MailGlobal.MAIL_LANG_EN
    def load_mail_files(m_files):  # load mail files
        mails = []
        for lc_i, mail_file in enumerate(m_files):
            if os.path.exists(mail_file):
                mail_data = load_mail_file(mail_file)
                print "read mail json file:", mail_file, ", mail number = ", len(mail_data)
                mails.extend(mail_data)
        print "read total mail number = ", len(mails)
        return mails
    def remove_dumplicate(mails):  # remove duplicate mail according "subject" and "body"
        # NOTE(review): writes through the enclosing-scope name `mail_datas`
        # (the same object as `mails` at the call site below) -- fragile coupling.
        mails_md5 = np.empty(len(mails), dtype='|S32')
        for lc_i, lc_mail in enumerate(mails):
            # short bodies hash subject+body together, long bodies hash the body alone
            if len(lc_mail['body']) < 100:
                mails_md5[lc_i] = hashlib.md5(("%s %s" % (lc_mail['subj'], lc_mail['body'])).encode('utf8')).hexdigest()
            else:
                mails_md5[lc_i] = hashlib.md5(lc_mail['body'].encode('utf8')).hexdigest()
            mail_datas[lc_i]['md5'] = mails_md5[lc_i]
        # np.unique keeps the first index of each distinct md5
        _, unique_idx = np.unique(mails_md5, return_index=True)
        unique_mail_datas = [mail_datas[idx] for idx in unique_idx]
        return unique_mail_datas
    def remove_dumplicate_v2(mails):  # remove duplicate by simhash
        from module.simhash import SimhashBucket
        dup_bucket = SimhashBucket()
        unique_mail_datas = []
        for lc_i, lc_mail in enumerate(mails):
            mails[lc_i]['md5'] = hashlib.md5(("%s %s" % (lc_mail['subj'], lc_mail['body'])).encode('utf8')).hexdigest()
            if not dup_bucket.has_duplicate(lc_mail):
                dup_bucket.add(lc_mail)
                unique_mail_datas.append(lc_mail)
            if lc_i % 100 == 0:  # progress indicator
                print lc_i,
        print
        return unique_mail_datas
    def filter_by_cluster(mails):
        # down-sample: keep ~10% of the mails from each KMeans cluster of tf-idf features
        from sklearn.cluster import KMeans
        from sklearn.feature_extraction.text import TfidfVectorizer
        from mail_preprocess import parallel_preprocess
        from scipy.sparse import csr_matrix
        print "parallel preprocess"
        parallel_preprocess(mails)
        mail_text = ["%s %s" % (mail['subj_tok'], mail['body_tok']) for mail in mails]
        print "tf idf vectorizer"
        mail_feat = TfidfVectorizer().fit_transform(mail_text).tocsr()
        print "kmeans cluster"
        # fit on at most 100k random rows to bound the clustering cost
        fit_num = min(100000, mail_feat.shape[0])
        fit_indics = np.random.choice(mail_feat.shape[0], fit_num, replace=False)
        fit_feat = mail_feat[fit_indics, :]
        model = KMeans(n_clusters=100, max_iter=200, verbose=1, n_jobs=MailGlobal.etc_map['n_jobs'])
        model.fit(fit_feat)
        pred_labels = model.predict(mail_feat)
        uniq_labels = np.unique(pred_labels)
        sample_mails = []
        for label in uniq_labels:
            label_idx = np.where(pred_labels == label)[0]
            # choice_num = min(100, len(label_idx))
            choice_num = int(len(label_idx) * 0.1)
            select_idx = np.random.choice(label_idx, choice_num, replace=False)
            map(lambda x: sample_mails.append(mails[x]), select_idx)
        return sample_mails
    def filter_by_NMF(mails):
        # down-sample: keep ~10% per NMF topic (argmax component as the label)
        from sklearn.decomposition import NMF
        from sklearn.feature_extraction.text import TfidfVectorizer
        from mail_preprocess import parallel_preprocess
        print "parallel preprocess"
        parallel_preprocess(mails)
        mail_text = ["%s %s" % (mail['subj_tok'], mail['body_tok']) for mail in mails]
        print "tf idf vectorizer"
        mail_feat = TfidfVectorizer().fit_transform(mail_text)
        print "nmf model"
        mail_feat = NMF(n_components=1000, init='random', max_iter=200, random_state=0).fit_transform(mail_feat)
        pred_labels = np.argmax(mail_feat, axis=1)
        uniq_labels = np.unique(pred_labels)
        sample_mails = []
        for label in uniq_labels:
            label_idx = np.where(pred_labels == label)[0]
            choice_num = int(len(label_idx) * 0.1)
            select_idx = np.random.choice(label_idx, choice_num, replace=False)
            map(lambda x: sample_mails.append(mails[x]), select_idx)
        return sample_mails
    def find_spam_address(mails):
        # find spam email according by alarm of "mailadmin@alarm.360.cn"
        lc_spam_set = set()
        email_reg = re.compile('[\w\.-]+@[\w\.-]+')
        spam_words = [u'88全讯网']
        for lc_mail in mails:
            if type(lc_mail['from']) != dict:
                continue
            if lc_mail['from']['a'] == "mailadmin@alarm.360.cn":
                # extract email format data from mail body
                spams = email_reg.findall(lc_mail['body'])
                # print "\n".join(spams)
                map(lambda x: lc_spam_set.add(x), spams)
            else:
                # any mail containing a known spam word marks its sender as spam
                for word in spam_words:
                    if lc_mail['subj'].find(word) >= 0 or lc_mail['body'].find(word) >= 0:
                        lc_spam_set.add(lc_mail['from']['a'])
        # filter white dns address from spam list
        spam_set2 = set()
        for addr in lc_spam_set:
            if addr.endswith("360.cn") or addr.endswith("qihoo.net"):
                continue
            spam_set2.add(addr)
        with open("%s/spam_addr_list.txt" % MailGlobal.data_dir, 'wb') as spam_writer:
            spam_writer.write("\n".join(spam_set2))
        print "finish write spam file"
        lc_spam_set = spam_set2
        return lc_spam_set
    def filter_ignore_mails(mails):  # ignore mails in ignore list
        ignore_list = set([u'mailadmin@alarm.360.cn'])
        remain_mail_datas = []
        for local_mail in mails:
            if type(local_mail['from']) != dict or local_mail['from']['a'] not in ignore_list:
                remain_mail_datas.append(local_mail)
        return remain_mail_datas
    def filter_ignore_mails_v1(mails):  # ignore empty mails
        # NOTE(review): this KEEPS only empty-body mails, contradicting the
        # "ignore empty mails" comment -- condition looks inverted. Unused here.
        remain_mail_datas = []
        for local_mail in mails:
            if local_mail['body'].strip() == "":
                remain_mail_datas.append(local_mail)
        return remain_mail_datas
    mail_datas = load_mail_files(mail_files)
    mail_datas = remove_dumplicate(mail_datas)
    print "after remove_dumplicate, left mails", len(mail_datas)
    # mail_datas = remove_dumplicate_v2(mail_datas)
    # print "after remove_dumplicate_v2, left mails", len(mail_datas)
    spam_set = find_spam_address(mail_datas)
    mail_datas = filter_ignore_mails(mail_datas)
    print "after filter_ignore_mails, left mails", len(mail_datas)
    # choose samples by cluster
    # mail_datas = filter_by_cluster(mail_datas)
    # mail_datas = filter_by_NMF(mail_datas)
    # print "after filter_by_cluster, left mails", len(mail_datas)
    # print "="*20, "spam address list", "="*20
    # print "\n".join(spam_set2)
    # collection mails
    def score_mail_by_rule(mails):
        # label by blacklist / reply heuristics; score remaining cn mails with the rule model
        batch_mails = {lang_cn: [], lang_en: []}
        scores_cn = []
        spam_num, not_spam_num = 0, 0
        for mail in mails:
            lang = check_language(mail)
            if mail['from']['a'] in spam_set:  # when address in black list
                mail['label'] = 1
            elif MailGlobal.reply_parser.is_reply(mail):  # if is reply email
                mail['label'] = 0
            elif lang == lang_cn:
                score = cn_rule_model.score_mail_by_rule(mail)
                scores_cn.append(score)
            if lang == lang_cn and 'label' in mail.m:
                if mail['label'] == 1:
                    spam_num += 1
                else:
                    not_spam_num += 1
            if lang == lang_cn or (lang == lang_en and 'label' in mail.m):
                batch_mails[lang].append(mail)
        # max min scaler
        scores_cn = max_min_scale(scores_cn)
        # attach scaled rule scores to the unlabeled cn mails, preserving order
        step_j = 0
        for i in xrange(len(batch_mails[lang_cn])):
            if 'label' not in batch_mails[lang_cn][i].m:
                batch_mails[lang_cn][i]['sc'] = scores_cn[step_j]
                step_j += 1
        print "cn email=", len(batch_mails[lang_cn]), ", spam_email=", spam_num, ", not_spam_email=", not_spam_num
        print "en email=", len(batch_mails[lang_en])
        return batch_mails
    def score_mail_by_clf(mails):
        # score cn mails with the trained spam classifier and keep the top-scoring 50%
        from mail_entity import MailSet
        from mail_preprocess import parallel_preprocess
        tok_mails = parallel_preprocess(mails)
        batch_mails = {lang_cn: [], lang_en: []}
        for mail in tok_mails:
            lang = check_language(mail)
            batch_mails[lang].append(mail)
        mail_sets_cn = MailSet(batch_mails[lang_cn])
        # mail_sets_en = MailSet(batch_mails[lang_en])
        prob_y_cn = MailGlobal.spam_clf.predict(mail_sets_cn, prob=True, lang=lang_cn)
        for idx in xrange(len(prob_y_cn)):
            batch_mails[lang_cn][idx]['sc'] = prob_y_cn[idx]
        batch_mails[lang_cn].sort(key=lambda x: x['sc'], cmp=lambda x, y: cmp(float(x), float(y)), reverse=True)
        batch_mails[lang_cn] = batch_mails[lang_cn][: int(len(batch_mails[lang_cn]) * 0.5)]
        return batch_mails
    # batch_mails = score_mail_by_rule(mail_datas)
    batch_mails = score_mail_by_clf(mail_datas)
    print "after clf, remain mails=", len(batch_mails[lang_cn])
    batch_mails[lang_cn] = filter_by_cluster(batch_mails[lang_cn])
    # dump the selected cn and en mails as one json object per line
    with open("%s_cn.json" % output_prefix, 'wb') as m_writer:
        for mail in batch_mails[lang_cn]:
            json_mail = mail.m
            m_writer.write("%s\n" % json.dumps(json_mail))
    with open("%s_en.json" % output_prefix, 'wb') as m_writer:
        for mail in batch_mails[lang_en]:
            m_writer.write("%s\n" % json.dumps(mail.m))
def label_mails(mail_file, output_map, output_data, lang):
    """
    Interactive console labeling of mails as spam (1) / not spam (0).
    Known labels are loaded from and appended to `output_map`; unlabeled mails
    are presented in descending 'sc' (score) order and answers are appended to
    both output files as the user types them.
    :param mail_file: mail file
    :param output_map: file name to store map {mail_md5, mail_labels}
    :param output_data: file name to store labeled json mails.
    :param lang: mail language
    :return:
    """
    from mail_io import load_mail_file
    from mail_preprocess import mail_print
    mail_datas = load_mail_file(mail_file, raw_type='dict')
    # load the existing md5 -> label map, if any
    if os.path.exists(output_map):
        # NOTE(review): this file handle is never closed
        mail_labels_line = open(output_map, 'rb').readlines()
        mail_labels = {}
        for unit in mail_labels_line:
            unit = unit.split(',')
            mail_labels[unit[0].strip()] = int(unit[1])  # (mail_md5, label)
    else:
        mail_labels = {}
    with open(output_map, 'a+') as mail_label_writer:
        # NOTE(review): labeled_data_writer is opened but never explicitly closed
        labeled_data_writer = open(output_data, 'a+')
        # order "sc"
        score_ordered = {'sc': [], 'idx': []}
        for i, mail in enumerate(mail_datas):
            if 'sc' in mail.m:
                score_ordered['sc'].append(float(mail['sc']))
                score_ordered['idx'].append(i)
            if 'label' in mail.m and mail['md5'] not in mail_labels:  # write labeled mail
                label = int(mail['label'])
                mail_labels[mail['md5']] = label
                mail_label_writer.write("%s, %s\n" % (mail['md5'], label))
                json_mail = mail.m
                json_mail['label'] = int(label)
                json_mail['lang'] = lang
                labeled_data_writer.write("%s\n" % json.dumps(json_mail))
        # sort candidate indices by score, highest first
        sort_idx = np.argsort(score_ordered['sc'])[::-1]
        score_ordered['idx'] = np.array(score_ordered['idx'])
        score_ordered['idx'] = score_ordered['idx'][sort_idx]
        for idx in score_ordered['idx']:
            mail = mail_datas[idx]
            if mail['md5'] in mail_labels:
                continue
            else:  # label
                print "=" * 20, "labeled mail number: ", len(mail_labels), "/", len(score_ordered['idx']), "=" * 20
                mail_print(mail)
                # prompt until the user answers 0 or 1
                while True:
                    if 'sc' in mail.m:
                        label = raw_input("mail score %.4f, is spam email ? [0, 1]: " % mail['sc'])
                    else:
                        label = raw_input("is spam email ? [0, 1]: ")
                    if label == '0' or label == '1':
                        break
                mail_labels[mail['md5']] = int(label)
                mail_label_writer.write("%s, %s\n" % (mail['md5'], label))
                json_mail = mail.m
                json_mail['label'] = int(label)
                json_mail['lang'] = lang
                labeled_data_writer.write("%s\n" % json.dumps(json_mail))
    print "finish label spam email"
def analysis():
    """Offline evaluation scaffolding for the spam classifier.

    Currently disabled: the commented-out block below sketched a train/test
    split, training via MailGlobal.spam_clf and an AUC printout.  As written
    the function only builds two empty lists and returns None.
    """
    # spam classifier
    sample_raw_text, sample_label = [], []
    # for mail in mail_datas:
    #     if mail_labels.has_key(mail._md5):
    #         sample_raw_text.append(mail._body)
    #         sample_label.append(mail_labels[mail._md5])
    # sample_raw_text = np.array(sample_raw_text)
    # sample_label = np.array(sample_label)
    # shuf_indics = range(0, len(sample_raw_text))
    # random.shuffle(shuf_indics)
    # test_size = 0.1
    # split_pos = int(len(sample_raw_text) * test_size)
    # test_raw_text, test_label = sample_raw_text[:split_pos], sample_label[:split_pos]
    # test_feat = MailGlobal.feature.transform_body(test_raw_text)
    # print "test shape:", test_feat.shape
    # if not os.path.exists(MailGlobal.spam_clf._spam_model_file):
    #     train_raw_text, train_label = sample_raw_text[split_pos:], sample_label[split_pos:]
    #     train_feat = MailGlobal.feature.transform_body(train_raw_text)
    #     print "train shape:", train_feat.shape
    #     MailGlobal.spam_clf.fit(train_feat, train_label)
    #     MailGlobal.spam_clf.save()
    # prob_y = MailGlobal.spam_clf.predict(test_feat, prob = True)
    # from sklearn.metrics import roc_auc_score
    # print "auc score: ", roc_auc_score(test_label, prob_y)