
/unladen_swallow/lib/spambayes/spambayes/tokenizer.py

https://bitbucket.org/cfbolz/benchmarks-pypy-phd


  1. #! /usr/bin/env python
  2. """Module to tokenize email messages for spam filtering."""
  3. from __future__ import generators
  4. import email
  5. import email.Message
  6. import email.Header
  7. import email.Utils
  8. import email.Errors
  9. import re
  10. import math
  11. import os
  12. import binascii
  13. import urlparse
  14. import urllib
  15. from spambayes import classifier
  16. from spambayes.Options import options
  17. from spambayes.mboxutils import get_message
  18. try:
  19. from spambayes import dnscache
  20. cache = dnscache.cache(cachefile=options["Tokenizer", "lookup_ip_cache"])
  21. cache.printStatsAtEnd = False
  22. except (IOError, ImportError):
  23. class cache:
  24. @staticmethod
  25. def lookup(*args):
  26. return []
  27. else:
  28. import atexit
  29. atexit.register(cache.close)
  30. # Patch encodings.aliases to recognize 'ansi_x3_4_1968'
  31. from encodings.aliases import aliases # The aliases dictionary
  32. if not aliases.has_key('ansi_x3_4_1968'):
  33. aliases['ansi_x3_4_1968'] = 'ascii'
  34. del aliases # Not needed any more
  35. ##############################################################################
  36. # To fold case or not to fold case? I didn't want to fold case, because
  37. # it hides information in English, and I have no idea what .lower() does
  38. # to other languages; and, indeed, 'FREE' (all caps) turned out to be one
  39. # of the strongest spam indicators in my content-only tests (== one with
  40. # prob 0.99 *and* made it into spamprob's nbest list very often).
  41. #
  42. # Against preserving case, it makes the database size larger, and requires
  43. # more training data to get enough "representative" mixed-case examples.
  44. #
  45. # Running my c.l.py tests didn't support my intuition that case was
  46. # valuable, so it's getting folded away now. Folding or not made no
  47. # significant difference to the false positive rate, and folding made a
  48. # small (but statistically significant all the same) reduction in the
  49. # false negative rate. There is one obvious difference: after folding
  50. # case, conference announcements no longer got high spam scores. Their
  51. # content was usually fine, but they were highly penalized for VISIT OUR
  52. # WEBSITE FOR MORE INFORMATION! kinds of repeated SCREAMING. That is
  53. # indeed the language of advertising, and I halfway regret that folding
  54. # away case no longer picks on them.
  55. #
  56. # Since the f-p rate didn't change, but conference announcements escaped
  57. # that category, something else took their place. It seems to be highly
  58. # off-topic messages, like debates about Microsoft's place in the world.
  59. # Talk about "money" and "lucrative" is indistinguishable now from talk
  60. # about "MONEY" and "LUCRATIVE", and spam mentions MONEY a lot.
  61. ##############################################################################
  62. # Character n-grams or words?
  63. #
  64. # With careful multiple-corpora c.l.py tests sticking to case-folded decoded
  65. # text-only portions, and ignoring headers, and with identical special
  66. # parsing & tagging of embedded URLs:
  67. #
  68. # Character 3-grams gave 5x as many false positives as split-on-whitespace
  69. # (s-o-w). The f-n rate was also significantly worse, but within a factor
  70. # of 2. So character 3-grams lost across the board.
  71. #
  72. # Character 5-grams gave 32% more f-ps than split-on-whitespace, but the
  73. # s-o-w fp rate across 20,000 presumed-hams was 0.1%, and this is the
  74. # difference between 23 and 34 f-ps. There aren't enough there to say that's
  75. # significantly more with killer-high confidence. There were plenty of f-ns,
  76. # though, and the f-n rate with character 5-grams was substantially *worse*
  77. # than with character 3-grams (which in turn was substantially worse than
  78. # with s-o-w).
  79. #
  80. # Training on character 5-grams creates many more unique tokens than s-o-w:
  81. # a typical run bloated to 150MB process size. It also ran a lot slower than
  82. # s-o-w, partly related to heavy indexing of a huge out-of-cache wordinfo
  83. # dict. I rarely noticed disk activity when running s-o-w, so rarely bothered
  84. # to look at process size; it was under 30MB last time I looked.
  85. #
  86. # Figuring out *why* a msg scored as it did proved much more mysterious when
  87. # working with character n-grams: they often had no obvious "meaning". In
  88. # contrast, it was always easy to figure out what s-o-w was picking up on.
  89. # 5-grams flagged a msg from Christian Tismer as spam, where he was discussing
  90. # the speed of tasklets under his new implementation of stackless:
  91. #
  92. # prob = 0.99999998959
  93. # prob('ed sw') = 0.01
  94. # prob('http0:pgp') = 0.01
  95. # prob('http0:python') = 0.01
  96. # prob('hlon ') = 0.99
  97. # prob('http0:wwwkeys') = 0.01
  98. # prob('http0:starship') = 0.01
  99. # prob('http0:stackless') = 0.01
  100. # prob('n xp ') = 0.99
  101. # prob('on xp') = 0.99
  102. # prob('p 150') = 0.99
  103. # prob('lon x') = 0.99
  104. # prob(' amd ') = 0.99
  105. # prob(' xp 1') = 0.99
  106. # prob(' athl') = 0.99
  107. # prob('1500+') = 0.99
  108. # prob('xp 15') = 0.99
  109. #
  110. # The spam decision was baffling until I realized that *all* the high-
  111. # probability spam 5-grams there came out of a single phrase:
  112. #
  113. # AMD Athlon XP 1500+
  114. #
  115. # So Christian was punished for using a machine lots of spam tries to sell
  116. # <wink>. In a classic Bayesian classifier, this probably wouldn't have
  117. # mattered, but Graham's throws away almost all the 5-grams from a msg,
  118. # saving only the about-a-dozen farthest from a neutral 0.5. So one bad
  119. # phrase can kill you! This appears to happen very rarely, but happened
  120. # more than once.
  121. #
  122. # The conclusion is that character n-grams have almost nothing to recommend
  123. # them under Graham's scheme: harder to work with, slower, much larger
  124. # database, worse results, and prone to rare mysterious disasters.
  125. #
  126. # There's one area they won hands-down: detecting spam in what I assume are
  127. # Asian languages. The s-o-w scheme sometimes finds only line-ends to split
  128. # on then, and then a "hey, this 'word' is way too big! let's ignore it"
  129. # gimmick kicks in, and produces no tokens at all.
  130. #
  131. # [Later: we produce character 5-grams then under the s-o-w scheme, instead
  132. # of ignoring the blob, but only if there are high-bit characters in the blob;
  133. # e.g., there's no point 5-gramming uuencoded lines, and doing so would
  134. # bloat the database size.]
  135. #
  136. # Interesting: despite that odd example above, the *kinds* of f-p mistakes
  137. # 5-grams made were very much like s-o-w made -- I recognized almost all of
  138. # the 5-gram f-p messages from previous s-o-w runs. For example, both
  139. # schemes have a particular hatred for conference announcements, although
  140. # s-o-w stopped hating them after folding case. But 5-grams still hate them.
  141. # Both schemes also hate msgs discussing HTML with examples, with about equal
  142. # passion. Both schemes hate brief "please subscribe [unsubscribe] me"
  143. # msgs, although 5-grams seems to hate them more.
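# Illustrative sketch (not part of the original module): the two schemes
# compared above, split-on-whitespace versus overlapping character n-grams.
def _sow_tokens_sketch(text):
    # split-on-whitespace: one token per whitespace-separated chunk
    return text.lower().split()

def _char_ngram_sketch(text, n=5):
    # overlapping character n-grams of the case-folded text
    text = text.lower()
    return [text[i:i+n] for i in range(len(text) - n + 1)]

# _char_ngram_sketch("AMD Athlon XP 1500+") yields 'amd a', 'md at', 'd ath',
# ... -- many correlated 5-grams out of one phrase, which is exactly the
# failure mode described above.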
  144. ##############################################################################
  145. # How to tokenize?
  146. #
  147. # I started with string.split() merely for speed. Over time I realized it
  148. # was making interesting context distinctions qualitatively akin to n-gram
  149. # schemes; e.g., "free!!" is a much stronger spam indicator than "free". But
  150. # unlike n-grams (whether word- or character- based) under Graham's scoring
  151. # scheme, this mild context dependence never seems to go over the edge in
  152. # giving "too much" credence to an unlucky phrase.
  153. #
  154. # OTOH, compared to "searching for words", it increases the size of the
  155. # database substantially, less than but close to a factor of 2. This is very
  156. # much less than a word bigram scheme bloats it, but as always an increase
  157. # isn't justified unless the results are better.
  158. #
  159. # Following are stats comparing
  160. #
  161. # for token in text.split(): # left column
  162. #
  163. # to
  164. #
  165. # for token in re.findall(r"[\w$\-\x80-\xff]+", text): # right column
  166. #
  167. # text is case-normalized (text.lower()) in both cases, and the runs were
  168. # identical in all other respects. The results clearly favor the split()
  169. # gimmick, although they vaguely suggest that some sort of compromise
  170. # may do as well with less database burden; e.g., *perhaps* folding runs of
  171. # "punctuation" characters into a canonical representative could do that.
  172. # But the database size is reasonable without that, and plain split() avoids
  173. # having to worry about how to "fold punctuation" in languages other than
  174. # English.
  175. #
  176. # false positive percentages
  177. # 0.000 0.000 tied
  178. # 0.000 0.050 lost
  179. # 0.050 0.150 lost
  180. # 0.000 0.025 lost
  181. # 0.025 0.050 lost
  182. # 0.025 0.075 lost
  183. # 0.050 0.150 lost
  184. # 0.025 0.000 won
  185. # 0.025 0.075 lost
  186. # 0.000 0.025 lost
  187. # 0.075 0.150 lost
  188. # 0.050 0.050 tied
  189. # 0.025 0.050 lost
  190. # 0.000 0.025 lost
  191. # 0.050 0.025 won
  192. # 0.025 0.000 won
  193. # 0.025 0.025 tied
  194. # 0.000 0.025 lost
  195. # 0.025 0.075 lost
  196. # 0.050 0.175 lost
  197. #
  198. # won 3 times
  199. # tied 3 times
  200. # lost 14 times
  201. #
  202. # total unique fp went from 8 to 20
  203. #
  204. # false negative percentages
  205. # 0.945 1.200 lost
  206. # 0.836 1.018 lost
  207. # 1.200 1.200 tied
  208. # 1.418 1.636 lost
  209. # 1.455 1.418 won
  210. # 1.091 1.309 lost
  211. # 1.091 1.272 lost
  212. # 1.236 1.563 lost
  213. # 1.564 1.855 lost
  214. # 1.236 1.491 lost
  215. # 1.563 1.599 lost
  216. # 1.563 1.781 lost
  217. # 1.236 1.709 lost
  218. # 0.836 0.982 lost
  219. # 0.873 1.382 lost
  220. # 1.236 1.527 lost
  221. # 1.273 1.418 lost
  222. # 1.018 1.273 lost
  223. # 1.091 1.091 tied
  224. # 1.490 1.454 won
  225. #
  226. # won 2 times
  227. # tied 2 times
  228. # lost 16 times
  229. #
  230. # total unique fn went from 292 to 302
  231. #
  232. # Later: Here's another tokenization scheme with more promise.
  233. #
  234. # fold case, ignore punctuation, strip a trailing 's' from words (to
  235. # stop Guido griping about "hotel" and "hotels" getting scored as
  236. # distinct clues <wink>) and save both word bigrams and word unigrams
  237. #
  238. # This was the code:
  239. #
  240. # # Tokenize everything in the body.
  241. # lastw = ''
  242. # for w in word_re.findall(text):
  243. # n = len(w)
  244. # # Make sure this range matches in tokenize_word().
  245. # if 3 <= n <= 12:
  246. # if w[-1] == 's':
  247. # w = w[:-1]
  248. # yield w
  249. # if lastw:
  250. # yield lastw + w
  251. # lastw = w + ' '
  252. #
  253. # elif n >= 3:
  254. # lastw = ''
  255. # for t in tokenize_word(w):
  256. # yield t
  257. #
  258. # where
  259. #
  260. # word_re = re.compile(r"[\w$\-\x80-\xff]+")
  261. #
  262. # This at least doubled the process size. It helped the f-n rate
  263. # significantly, but probably hurt the f-p rate (the f-p rate is too low
  264. # with only 4000 hams per run to be confident about changes of such small
  265. # *absolute* magnitude -- 0.025% is a single message in the f-p table):
  266. #
  267. # false positive percentages
  268. # 0.000 0.000 tied
  269. # 0.000 0.075 lost +(was 0)
  270. # 0.050 0.125 lost +150.00%
  271. # 0.025 0.000 won -100.00%
  272. # 0.075 0.025 won -66.67%
  273. # 0.000 0.050 lost +(was 0)
  274. # 0.100 0.175 lost +75.00%
  275. # 0.050 0.050 tied
  276. # 0.025 0.050 lost +100.00%
  277. # 0.025 0.000 won -100.00%
  278. # 0.050 0.125 lost +150.00%
  279. # 0.050 0.025 won -50.00%
  280. # 0.050 0.050 tied
  281. # 0.000 0.025 lost +(was 0)
  282. # 0.000 0.025 lost +(was 0)
  283. # 0.075 0.050 won -33.33%
  284. # 0.025 0.050 lost +100.00%
  285. # 0.000 0.000 tied
  286. # 0.025 0.100 lost +300.00%
  287. # 0.050 0.150 lost +200.00%
  288. #
  289. # won 5 times
  290. # tied 4 times
  291. # lost 11 times
  292. #
  293. # total unique fp went from 13 to 21
  294. #
  295. # false negative percentages
  296. # 0.327 0.218 won -33.33%
  297. # 0.400 0.218 won -45.50%
  298. # 0.327 0.218 won -33.33%
  299. # 0.691 0.691 tied
  300. # 0.545 0.327 won -40.00%
  301. # 0.291 0.218 won -25.09%
  302. # 0.218 0.291 lost +33.49%
  303. # 0.654 0.473 won -27.68%
  304. # 0.364 0.327 won -10.16%
  305. # 0.291 0.182 won -37.46%
  306. # 0.327 0.254 won -22.32%
  307. # 0.691 0.509 won -26.34%
  308. # 0.582 0.473 won -18.73%
  309. # 0.291 0.255 won -12.37%
  310. # 0.364 0.218 won -40.11%
  311. # 0.436 0.327 won -25.00%
  312. # 0.436 0.473 lost +8.49%
  313. # 0.218 0.218 tied
  314. # 0.291 0.255 won -12.37%
  315. # 0.254 0.364 lost +43.31%
  316. #
  317. # won 15 times
  318. # tied 2 times
  319. # lost 3 times
  320. #
  321. # total unique fn went from 106 to 94
  322. ##############################################################################
  323. # What about HTML?
  324. #
  325. # Computer geeks seem to view use of HTML in mailing lists and newsgroups as
  326. # a mortal sin. Normal people don't, but so it goes: in a technical list/
  327. # group, every HTML decoration has spamprob 0.99, there are lots of unique
  328. # HTML decorations, and lots of them appear at the very start of the message
  329. # so that Graham's scoring scheme latches on to them tight. As a result,
  330. # any plain text message just containing an HTML example is likely to be
  331. # judged spam (every HTML decoration is an extreme).
  332. #
  333. # So if a message is multipart/alternative with both text/plain and text/html
  334. # branches, we ignore the latter, else newbies would never get a message
  335. # through. If a message is just HTML, it has virtually no chance of getting
  336. # through.
  337. #
  338. # In an effort to let normal people use mailing lists too <wink>, and to
  339. # alleviate the woes of messages merely *discussing* HTML practice, I
  340. # added a gimmick to strip HTML tags after case-normalization and after
  341. # special tagging of embedded URLs. This consisted of a regexp sub pattern,
  342. # where instances got replaced by single blanks:
  343. #
  344. # html_re = re.compile(r"""
  345. # <
  346. # [^\s<>] # e.g., don't match 'a < b' or '<<<' or 'i << 5' or 'a<>b'
  347. # [^>]{0,128} # search for the end '>', but don't chew up the world
  348. # >
  349. # """, re.VERBOSE)
  350. #
  351. # and then
  352. #
  353. # text = html_re.sub(' ', text)
  354. #
  355. # Alas, little good came of this:
  356. #
  357. # false positive percentages
  358. # 0.000 0.000 tied
  359. # 0.000 0.000 tied
  360. # 0.050 0.075 lost
  361. # 0.000 0.000 tied
  362. # 0.025 0.025 tied
  363. # 0.025 0.025 tied
  364. # 0.050 0.050 tied
  365. # 0.025 0.025 tied
  366. # 0.025 0.025 tied
  367. # 0.000 0.050 lost
  368. # 0.075 0.100 lost
  369. # 0.050 0.050 tied
  370. # 0.025 0.025 tied
  371. # 0.000 0.025 lost
  372. # 0.050 0.050 tied
  373. # 0.025 0.025 tied
  374. # 0.025 0.025 tied
  375. # 0.000 0.000 tied
  376. # 0.025 0.050 lost
  377. # 0.050 0.050 tied
  378. #
  379. # won 0 times
  380. # tied 15 times
  381. # lost 5 times
  382. #
  383. # total unique fp went from 8 to 12
  384. #
  385. # false negative percentages
  386. # 0.945 1.164 lost
  387. # 0.836 1.418 lost
  388. # 1.200 1.272 lost
  389. # 1.418 1.272 won
  390. # 1.455 1.273 won
  391. # 1.091 1.382 lost
  392. # 1.091 1.309 lost
  393. # 1.236 1.381 lost
  394. # 1.564 1.745 lost
  395. # 1.236 1.564 lost
  396. # 1.563 1.781 lost
  397. # 1.563 1.745 lost
  398. # 1.236 1.455 lost
  399. # 0.836 0.982 lost
  400. # 0.873 1.309 lost
  401. # 1.236 1.381 lost
  402. # 1.273 1.273 tied
  403. # 1.018 1.273 lost
  404. # 1.091 1.200 lost
  405. # 1.490 1.599 lost
  406. #
  407. # won 2 times
  408. # tied 1 times
  409. # lost 17 times
  410. #
  411. # total unique fn went from 292 to 327
  412. #
  413. # The messages merely discussing HTML were no longer fps, so it did what it
  414. # intended there. But the f-n rate nearly doubled on at least one run -- so
  415. # strong a set of spam indicators is the mere presence of HTML. The increase
  416. # in the number of fps, even though the HTML-discussing msgs left that
  417. # category, remains mysterious to me, but it wasn't a significant increase
  418. # so I let it drop.
  419. #
  420. # Later: If I simply give up on making mailing lists friendly to my sisters
  421. # (they're not nerds, and create wonderfully attractive HTML msgs), a
  422. # compromise is to strip HTML tags from only text/plain msgs. That's
  423. # principled enough so far as it goes, and eliminates the HTML-discussing
  424. # false positives. It remains disturbing that the f-n rate on pure HTML
  425. # msgs increases significantly when stripping tags, so the code here doesn't
  426. # do that part. However, even after stripping tags, the rates above show that
  427. # at least 98% of spams are still correctly identified as spam.
  428. #
  429. # So, if another way is found to slash the f-n rate, the decision here not
  430. # to strip HTML from HTML-only msgs should be revisited.
  431. #
  432. # Later, after the f-n rate got slashed via other means:
  433. #
  434. # false positive percentages
  435. # 0.000 0.000 tied
  436. # 0.000 0.000 tied
  437. # 0.050 0.075 lost +50.00%
  438. # 0.025 0.025 tied
  439. # 0.075 0.025 won -66.67%
  440. # 0.000 0.000 tied
  441. # 0.100 0.100 tied
  442. # 0.050 0.075 lost +50.00%
  443. # 0.025 0.025 tied
  444. # 0.025 0.000 won -100.00%
  445. # 0.050 0.075 lost +50.00%
  446. # 0.050 0.050 tied
  447. # 0.050 0.025 won -50.00%
  448. # 0.000 0.000 tied
  449. # 0.000 0.000 tied
  450. # 0.075 0.075 tied
  451. # 0.025 0.025 tied
  452. # 0.000 0.000 tied
  453. # 0.025 0.025 tied
  454. # 0.050 0.050 tied
  455. #
  456. # won 3 times
  457. # tied 14 times
  458. # lost 3 times
  459. #
  460. # total unique fp went from 13 to 11
  461. #
  462. # false negative percentages
  463. # 0.327 0.400 lost +22.32%
  464. # 0.400 0.400 tied
  465. # 0.327 0.473 lost +44.65%
  466. # 0.691 0.654 won -5.35%
  467. # 0.545 0.473 won -13.21%
  468. # 0.291 0.364 lost +25.09%
  469. # 0.218 0.291 lost +33.49%
  470. # 0.654 0.654 tied
  471. # 0.364 0.473 lost +29.95%
  472. # 0.291 0.327 lost +12.37%
  473. # 0.327 0.291 won -11.01%
  474. # 0.691 0.654 won -5.35%
  475. # 0.582 0.655 lost +12.54%
  476. # 0.291 0.400 lost +37.46%
  477. # 0.364 0.436 lost +19.78%
  478. # 0.436 0.582 lost +33.49%
  479. # 0.436 0.364 won -16.51%
  480. # 0.218 0.291 lost +33.49%
  481. # 0.291 0.400 lost +37.46%
  482. # 0.254 0.327 lost +28.74%
  483. #
  484. # won 5 times
  485. # tied 2 times
  486. # lost 13 times
  487. #
  488. # total unique fn went from 106 to 122
  489. #
  490. # So HTML decorations are still a significant clue when the ham is composed
  491. # of c.l.py traffic. Again, this should be revisited if the f-n rate is
  492. # slashed again.
  493. #
  494. # Later: As the amount of training data increased, the effect of retaining
  495. # HTML tags decreased to insignificance. options.retain_pure_html_tags
  496. # was introduced to control this, and it defaulted to False. Later, as the
  497. # algorithm improved, retain_pure_html_tags was removed.
  498. #
  499. # Later: The decision to ignore "redundant" HTML is also dubious, since
  500. # the text/plain and text/html alternatives may have entirely different
  501. # content. options.ignore_redundant_html was introduced to control this,
  502. # and it defaults to False. Later: ignore_redundant_html was also removed.
  503. ##############################################################################
  504. # How big should "a word" be?
  505. #
  506. # As I write this, words less than 3 chars are ignored completely, and words
  507. # with more than 12 are special-cased, replaced with a summary "I skipped
  508. # about so-and-so many chars starting with such-and-such a letter" token.
  509. # This makes sense for English if most of the info is in "regular size"
  510. # words.
  511. #
  512. # A test run boosting to 13 had no effect on f-p rate, and did a little
  513. # better or worse than 12 across runs -- overall, no significant difference.
  514. # The database size is smaller at 12, so there's nothing in favor of 13.
  515. # A test at 11 showed a slight but consistent bad effect on the f-n rate
  516. # (lost 12 times, won once, tied 7 times).
  517. #
  518. # A test with no lower bound showed a significant increase in the f-n rate.
  519. # Curious, but not worth digging into. Boosting the lower bound to 4 is a
  520. # worse idea: f-p and f-n rates both suffered significantly then. I didn't
  521. # try testing with lower bound 2.
  522. #
  523. # Anthony Baxter found that boosting the option skip_max_word_size to 20
  524. # from its default of 12 produced a quite dramatic decrease in the number
  525. # of 'unsure' messages. However, this was coupled with a large increase
  526. # in the FN rate, and it remains unclear whether simply shifting cutoffs
  527. # would have given the same tradeoff (not enough data was posted to tell).
  528. #
  529. # On Tim's c.l.py test, 10-fold CV, ham_cutoff=0.20 and spam_cutoff=0.80:
  530. #
  531. # -> <stat> tested 2000 hams & 1400 spams against 18000 hams & 12600 spams
  532. # [ditto]
  533. #
  534. # filename: max12 max20
  535. # ham:spam: 20000:14000
  536. # 20000:14000
  537. # fp total: 2 2 the same
  538. # fp %: 0.01 0.01
  539. # fn total: 0 0 the same
  540. # fn %: 0.00 0.00
  541. # unsure t: 103 100 slight decrease
  542. # unsure %: 0.30 0.29
  543. # real cost: $40.60 $40.00 slight improvement with these cutoffs
  544. # best cost: $27.00 $27.40 best possible got slightly worse
  545. # h mean: 0.28 0.27
  546. # h sdev: 2.99 2.92
  547. # s mean: 99.94 99.93
  548. # s sdev: 1.41 1.47
  549. # mean diff: 99.66 99.66
  550. # k: 22.65 22.70
  551. #
  552. # "Best possible" in max20 would have been to boost ham_cutoff to 0.50(!),
  553. # and drop spam_cutoff a little to 0.78. This would have traded away most
  554. # of the unsures in return for letting 3 spam through:
  555. #
  556. # -> smallest ham & spam cutoffs 0.5 & 0.78
  557. # -> fp 2; fn 3; unsure ham 11; unsure spam 11
  558. # -> fp rate 0.01%; fn rate 0.0214%; unsure rate 0.0647%
  559. #
  560. # Best possible in max12 was much the same:
  561. #
  562. # -> largest ham & spam cutoffs 0.5 & 0.78
  563. # -> fp 2; fn 3; unsure ham 12; unsure spam 8
  564. # -> fp rate 0.01%; fn rate 0.0214%; unsure rate 0.0588%
  565. #
  566. # The classifier pickle size increased by about 1.5 MB (~8.4% bigger).
  567. #
  568. # Rob Hooft's results were worse:
  569. #
  570. # -> <stat> tested 1600 hams & 580 spams against 14400 hams & 5220 spams
  571. # [...]
  572. # -> <stat> tested 1600 hams & 580 spams against 14400 hams & 5220 spams
  573. # filename: skip12 skip20
  574. # ham:spam: 16000:5800
  575. # 16000:5800
  576. # fp total: 12 13
  577. # fp %: 0.07 0.08
  578. # fn total: 7 7
  579. # fn %: 0.12 0.12
  580. # unsure t: 178 184
  581. # unsure %: 0.82 0.84
  582. # real cost: $162.60 $173.80
  583. # best cost: $106.20 $109.60
  584. # h mean: 0.51 0.52
  585. # h sdev: 4.87 4.92
  586. # s mean: 99.42 99.39
  587. # s sdev: 5.22 5.34
  588. # mean diff: 98.91 98.87
  589. # k: 9.80 9.64
  590. # textparts(msg) returns a set containing all the text components of msg.
  591. # There's no point decoding binary blobs (like images). If a text/plain
  592. # and text/html part happen to have redundant content, it doesn't matter
  593. # to results, since training and scoring are done on the set of all
  594. # words in the msg, without regard to how many times a given word appears.
  595. def textparts(msg):
  596. """Return a set of all msg parts with content maintype 'text'."""
  597. return set(filter(lambda part: part.get_content_maintype() == 'text',
  598. msg.walk()))
  599. def octetparts(msg):
  600. """Return a set of all msg parts with type 'application/octet-stream'."""
  601. return set(filter(lambda part:
  602. part.get_content_type() == 'application/octet-stream',
  603. msg.walk()))
  604. def imageparts(msg):
  605. """Return a list of all msg parts with type 'image/*'."""
  606. # Don't want a set here because we want to be able to process them in
  607. # order.
  608. return filter(lambda part:
  609. part.get_content_type().startswith('image/'),
  610. msg.walk())
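# Illustrative usage sketch (not part of the original module): passing a
# parsed message to the part selectors above; the sample message is made up.
def _part_selector_sketch():
    msg = email.message_from_string(
        "Content-Type: text/plain\n\nHello, world\n")
    # For a non-multipart text/plain message, walk() yields only the message
    # itself, so textparts() is a one-element set and the other two are empty.
    return textparts(msg), octetparts(msg), list(imageparts(msg))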
  611. has_highbit_char = re.compile(r"[\x80-\xff]").search
  612. # Cheap-ass gimmick to probabilistically find HTML/XML tags.
  613. # Note that <style and HTML comments are handled by crack_html_style()
  614. # and crack_html_comment() instead -- they can be very long, and long
  615. # minimal matches have a nasty habit of blowing the C stack.
  616. html_re = re.compile(r"""
  617. <
  618. (?![\s<>]) # e.g., don't match 'a < b' or '<<<' or 'i<<5' or 'a<>b'
  619. # guessing that other tags are usually "short"
  620. [^>]{0,256} # search for the end '>', but don't run wild
  621. >
  622. """, re.VERBOSE | re.DOTALL)
  623. # Trailing letter serves to reject "hostnames" which are really ip
  624. # addresses. Some spammers forge their apparent ip addresses, so you get
  625. # Received: headers which look like:
  626. # Received: from 199.249.165.175 ([218.5.93.116])
  627. # by manatee.mojam.com (8.12.1-20030917/8.12.1) with SMTP id
  628. # hBIERsqI018090
  629. # for <itinerary@musi-cal.com>; Thu, 18 Dec 2003 08:28:11 -0600
  630. # "199.249.165.175" is who the spamhaus said it was. That's really the
  631. # ip address of the receiving host (manatee.mojam.com), which correctly
  632. # identified the sender's ip address as 218.5.93.116.
  633. #
  634. # Similarly, the more complex character set instead of just \S serves to
  635. # reject Received: headers where the message bounces from one user to
  636. # another on the local machine:
  637. # Received: (from itin@localhost)
  638. # by manatee.mojam.com (8.12.1-20030917/8.12.1/Submit) id hBIEQFxF018044
  639. # for skip@manatee.mojam.com; Thu, 18 Dec 2003 08:26:15 -0600
  640. received_host_re = re.compile(r'from ([a-z0-9._-]+[a-z])[)\s]')
  641. # 99% of the time, the receiving host places the sender's ip address in
  642. # square brackets as it should, but every once in a while it turns up in
  643. # parens. Yahoo seems to be guilty of this minor infraction:
  644. # Received: from unknown (66.218.66.218)
  645. # by m19.grp.scd.yahoo.com with QMQP; 19 Dec 2003 04:06:53 -0000
  646. received_ip_re = re.compile(r'[[(]((\d{1,3}\.?){4})[])]')
  647. received_nntp_ip_re = re.compile(r'((\d{1,3}\.?){4})')
  648. message_id_re = re.compile(r'\s*<[^@]+@([^>]+)>\s*')
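# Illustrative check (not part of the original module): the Received: regexps
# above applied to a lowercased header like the Yahoo example quoted above.
def _received_re_sketch():
    line = "received: from unknown (66.218.66.218)"
    host = received_host_re.search(line)    # group(1) == 'unknown'
    ip = received_ip_re.search(line)         # group(1) == '66.218.66.218'
    return host.group(1), ip.group(1)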
  649. # I'm usually just splitting on whitespace, but for subject lines I want to
  650. # break things like "Python/Perl comparison?" up. OTOH, I don't want to
  651. # break up the unitized numbers in spammish subject phrases like "Increase
  652. # size 79%" or "Now only $29.95!". Then again, I do want to break up
  653. # "Python-Dev". Runs of punctuation are also interesting in subject lines.
  654. subject_word_re = re.compile(r"[\w\x80-\xff$.%]+")
  655. punctuation_run_re = re.compile(r'\W+')
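# Illustrative sketch (not part of the original module): what the subject
# regexp keeps together and what it breaks apart, per the comment above.
def _subject_word_sketch():
    assert subject_word_re.findall("Now only $29.95!") == ["Now", "only", "$29.95"]
    assert subject_word_re.findall("Python/Perl comparison?") == ["Python", "Perl", "comparison"]
    assert subject_word_re.findall("Python-Dev") == ["Python", "Dev"]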
  656. fname_sep_re = re.compile(r'[/\\:]')
  657. def crack_filename(fname):
  658. yield "fname:" + fname
  659. components = fname_sep_re.split(fname)
  660. morethan1 = len(components) > 1
  661. for component in components:
  662. if morethan1:
  663. yield "fname comp:" + component
  664. pieces = urlsep_re.split(component)
  665. if len(pieces) > 1:
  666. for piece in pieces:
  667. yield "fname piece:" + piece
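# Illustrative sketch (not part of the original module): tokens generated for
# a made-up attachment name.
def _crack_filename_sketch():
    # list(crack_filename("letter.doc")) ==
    #     ['fname:letter.doc', 'fname piece:letter', 'fname piece:doc']
    # (no 'fname comp:' tokens, since there is only one path component)
    return list(crack_filename("letter.doc"))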
  668. def tokenize_word(word, _len=len, maxword=options["Tokenizer",
  669. "skip_max_word_size"]):
  670. n = _len(word)
  671. # Make sure this range matches in tokenize().
  672. if 3 <= n <= maxword:
  673. yield word
  674. elif n >= 3:
  675. # A long word.
  676. # Don't want to skip embedded email addresses.
  677. # An earlier scheme also split up the y in x@y on '.'. Not splitting
  678. # improved the f-n rate; the f-p rate didn't care either way.
  679. if n < 40 and '.' in word and word.count('@') == 1:
  680. p1, p2 = word.split('@')
  681. yield 'email name:' + p1
  682. yield 'email addr:' + p2
  683. else:
  684. # There's value in generating a token indicating roughly how
  685. # many chars were skipped. This has real benefit for the f-n
  686. # rate, but is neutral for the f-p rate. I don't know why!
  687. # XXX Figure out why, and/or see if some other way of summarizing
  688. # XXX this info has greater benefit.
  689. if options["Tokenizer", "generate_long_skips"]:
  690. yield "skip:%c %d" % (word[0], n // 10 * 10)
  691. if has_highbit_char(word):
  692. hicount = 0
  693. for i in map(ord, word):
  694. if i >= 128:
  695. hicount += 1
  696. yield "8bit%%:%d" % round(hicount * 100.0 / len(word))
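# Illustrative sketch (not part of the original module).  Tokens for long
# plain words depend on the generate_long_skips option; the embedded email
# address case below does not.
def _tokenize_word_sketch():
    # short words pass through unchanged
    assert list(tokenize_word("free")) == ["free"]
    # a long word with a single '@' is split into name/addr tokens
    assert list(tokenize_word("someone@example.com")) == [
        "email name:someone", "email addr:example.com"]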
  697. # Generate tokens for:
  698. # Content-Type
  699. # and its type= param
  700. # Content-Disposition
  701. # and its filename= param
  702. # all the charsets
  703. #
  704. # This has huge benefit for the f-n rate, and virtually no effect on the f-p
  705. # rate, although it does reduce the variance of the f-p rate across different
  706. # training sets (really marginal msgs, like a brief HTML msg saying just
  707. # "unsubscribe me", are almost always tagged as spam now; before they were
  708. # right on the edge, and now the multipart/alternative pushes them over it
  709. # more consistently).
  710. #
  711. # XXX I put all of this in as one chunk. I don't know which parts are
  712. # XXX most effective; it could be that some parts don't help at all. But
  713. # XXX given the nature of the c.l.py tests, it's not surprising that the
  714. # XXX 'content-type:text/html'
  715. # XXX token is now the single most powerful spam indicator (== makes it
  716. # XXX into the nbest list most often). What *is* a little surprising is
  717. # XXX that this doesn't push more mixed-type msgs into the f-p camp --
  718. # XXX unlike looking at *all* HTML tags, this is just one spam indicator
  719. # XXX instead of dozens, so relevant msg content can cancel it out.
  720. #
  721. # A bug in this code prevented Content-Transfer-Encoding from getting
  722. # picked up. Fixing that bug showed that it didn't help, so the corrected
  723. # code is disabled now (left column without Content-Transfer-Encoding,
  724. # right column with it):
  725. #
  726. # false positive percentages
  727. # 0.000 0.000 tied
  728. # 0.000 0.000 tied
  729. # 0.100 0.100 tied
  730. # 0.000 0.000 tied
  731. # 0.025 0.025 tied
  732. # 0.025 0.025 tied
  733. # 0.100 0.100 tied
  734. # 0.025 0.025 tied
  735. # 0.025 0.025 tied
  736. # 0.050 0.050 tied
  737. # 0.100 0.100 tied
  738. # 0.025 0.025 tied
  739. # 0.025 0.025 tied
  740. # 0.025 0.025 tied
  741. # 0.025 0.025 tied
  742. # 0.025 0.025 tied
  743. # 0.025 0.025 tied
  744. # 0.000 0.025 lost +(was 0)
  745. # 0.025 0.025 tied
  746. # 0.100 0.100 tied
  747. #
  748. # won 0 times
  749. # tied 19 times
  750. # lost 1 times
  751. #
  752. # total unique fp went from 9 to 10
  753. #
  754. # false negative percentages
  755. # 0.364 0.400 lost +9.89%
  756. # 0.400 0.364 won -9.00%
  757. # 0.400 0.436 lost +9.00%
  758. # 0.909 0.872 won -4.07%
  759. # 0.836 0.836 tied
  760. # 0.618 0.618 tied
  761. # 0.291 0.291 tied
  762. # 1.018 0.981 won -3.63%
  763. # 0.982 0.982 tied
  764. # 0.727 0.727 tied
  765. # 0.800 0.800 tied
  766. # 1.163 1.127 won -3.10%
  767. # 0.764 0.836 lost +9.42%
  768. # 0.473 0.473 tied
  769. # 0.473 0.618 lost +30.66%
  770. # 0.727 0.763 lost +4.95%
  771. # 0.655 0.618 won -5.65%
  772. # 0.509 0.473 won -7.07%
  773. # 0.545 0.582 lost +6.79%
  774. # 0.509 0.509 tied
  775. #
  776. # won 6 times
  777. # tied 8 times
  778. # lost 6 times
  779. #
  780. # total unique fn went from 168 to 169
  781. # For support of the replace_nonascii_chars option, build a string.translate
  782. # table that maps all high-bit chars and control chars to a '?' character.
  783. non_ascii_translate_tab = ['?'] * 256
  784. # leave blank up to (but not including) DEL alone
  785. for i in range(32, 127):
  786. non_ascii_translate_tab[i] = chr(i)
  787. # leave "normal" whitespace alone
  788. for ch in ' \t\r\n':
  789. non_ascii_translate_tab[ord(ch)] = ch
  790. del i, ch
  791. non_ascii_translate_tab = ''.join(non_ascii_translate_tab)
  792. def crack_content_xyz(msg):
  793. yield 'content-type:' + msg.get_content_type()
  794. x = msg.get_param('type')
  795. if x is not None:
  796. yield 'content-type/type:' + x.lower()
  797. try:
  798. for x in msg.get_charsets(None):
  799. if x is not None:
  800. yield 'charset:' + x.lower()
  801. except UnicodeEncodeError:
  802. # Bad messages can cause an exception here.
  803. # See [ 1175439 ] UnicodeEncodeError raised for bogus Content-Type
  804. # header
  805. yield 'charset:invalid_unicode'
  806. x = msg.get('content-disposition')
  807. if x is not None:
  808. yield 'content-disposition:' + x.lower()
  809. try:
  810. fname = msg.get_filename()
  811. if fname is not None:
  812. for x in crack_filename(fname):
  813. yield 'filename:' + x
  814. except TypeError:
  815. # bug in email pkg? see the thread beginning at
  816. # http://mail.python.org/pipermail/spambayes/2003-September/008006.html
  817. # and
  818. # http://mail.python.org/pipermail/spambayes-dev/2003-September/001177.html
  819. yield "filename:<bogus>"
  820. if 0: # disabled; see comment before function
  821. x = msg.get('content-transfer-encoding')
  822. if x is not None:
  823. yield 'content-transfer-encoding:' + x.lower()
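# Illustrative sketch (not part of the original module): MIME metadata tokens
# for a minimal made-up single-part message.
def _crack_content_xyz_sketch():
    msg = email.message_from_string(
        "Content-Type: text/html; charset=us-ascii\n\nhi\n")
    # expected: ['content-type:text/html', 'charset:us-ascii'] -- there is no
    # type= param, Content-Disposition or filename in this message
    return list(crack_content_xyz(msg))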
  824. # The base64 decoder is actually very forgiving, but flubs one case:
  825. # if no padding is required (no trailing '='), it continues to read
  826. # following lines as if they were still part of the base64 part. We're
  827. # actually stricter here. The *point* is that some mailers tack plain
  828. # text on to the end of base64-encoded text sections.
  829. # Match a line of base64, up to & including the trailing newline.
  830. # We allow for optional leading and trailing whitespace, and don't care
  831. # about line length, but other than that are strict. Group 1 is non-empty
  832. # after a match iff the last significant char on the line is '='; in that
  833. # case, it must be the last line of the base64 section.
  834. base64_re = re.compile(r"""
  835. [ \t]*
  836. [a-zA-Z0-9+/]*
  837. (=*)
  838. [ \t]*
  839. \r?
  840. \n
  841. """, re.VERBOSE)
  842. def try_to_repair_damaged_base64(text):
  843. i = 0
  844. while True:
  845. # text[:i] looks like base64. Does the line starting at i also?
  846. m = base64_re.match(text, i)
  847. if not m:
  848. break
  849. i = m.end()
  850. if m.group(1):
  851. # This line has a trailing '=' -- the base64 part is done.
  852. break
  853. base64text = ''
  854. if i:
  855. base64 = text[:i]
  856. try:
  857. base64text = binascii.a2b_base64(base64)
  858. except:
  859. # There's no point in tokenizing raw base64 gibberish.
  860. pass
  861. return base64text + text[i:]
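# Illustrative sketch (not part of the original module): a base64 section with
# plain text tacked on the end, as described in the comment above.
def _repair_base64_sketch():
    damaged = "aGVsbG8=\nThanks for reading!\n"
    # the base64 line decodes to 'hello'; the trailing text is passed through
    assert try_to_repair_damaged_base64(damaged) == "helloThanks for reading!\n"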
  862. def breakdown_host(host):
  863. parts = host.split('.')
  864. for i in range(1, len(parts) + 1):
  865. yield '.'.join(parts[-i:])
  866. def breakdown_ipaddr(ipaddr):
  867. parts = ipaddr.split('.')
  868. for i in range(1, 5):
  869. yield '.'.join(parts[:i])
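# Illustrative sketch (not part of the original module): the breakdowns yield
# progressively longer host suffixes and ip-address prefixes.
def _breakdown_sketch():
    assert list(breakdown_host("mail.example.com")) == [
        "com", "example.com", "mail.example.com"]
    assert list(breakdown_ipaddr("66.218.66.218")) == [
        "66", "66.218", "66.218.66", "66.218.66.218"]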
  870. def log2(n, log=math.log, c=math.log(2)):
  871. return log(n)/c
  872. class Stripper(object):
  873. # The retained portions are catenated together with self.separator.
  874. # CAUTION: This used to be blank. But then I noticed spam putting
  875. # HTML comments embedded in words, like
  876. # FR<!--slkdflskjf-->EE!
  877. # Breaking this into "FR" and "EE!" wasn't a real help <wink>.
  878. separator = '' # a subclass can override if this isn't appropriate
  879. def __init__(self, find_start, find_end):
  880. # find_start and find_end have signature
  881. # string, int -> match_object
  882. # where the search starts at string[int:int]. If a match isn't found,
  883. # they must return None. The match_object for find_start, if not
  884. # None, is passed to self.tokenize, which returns a (possibly empty)
  885. # list of tokens to generate. Subclasses may override tokenize().
  886. # Text between find_start and find_end is thrown away, except for
  887. # whatever tokenize() produces. A match_object must support method
  888. # span() -> int, int # the slice bounds of what was matched
  889. self.find_start = find_start
  890. self.find_end = find_end
  891. # Efficiency note: This is cheaper than it looks if there aren't any
  892. # special sections. Under the covers, string[0:] is optimized to
  893. # return string (no new object is built), and likewise ' '.join([string])
  894. # is optimized to return string. It would actually slow this code down
  895. # to special-case these "do nothing" special cases at the Python level!
  896. def analyze(self, text):
  897. i = 0
  898. retained = []
  899. pushretained = retained.append
  900. tokens = []
  901. while True:
  902. m = self.find_start(text, i)
  903. if not m:
  904. pushretained(text[i:])
  905. break
  906. start, end = m.span()
  907. pushretained(text[i : start])
  908. tokens.extend(self.tokenize(m))
  909. m = self.find_end(text, end)
  910. if not m:
  911. # No matching end - act as if the open
  912. # tag did not exist.
  913. pushretained(text[start:])
  914. break
  915. dummy, i = m.span()
  916. return self.separator.join(retained), tokens
  917. def tokenize(self, match_object):
  918. # Override this if you want to suck info out of the start pattern.
  919. return []
  920. # Strip out uuencoded sections and produce tokens. The return value
  921. # is (new_text, sequence_of_tokens), where new_text no longer contains
  922. # uuencoded stuff. Note that we're not bothering to decode it! Maybe
  923. # we should. One of my persistent false negatives is a spam containing
  924. # nothing but a uuencoded money.txt; OTOH, uuencode seems to be on
  925. # its way out (that's an old spam).
  926. uuencode_begin_re = re.compile(r"""
  927. ^begin \s+
  928. (\S+) \s+ # capture mode
  929. (\S+) \s* # capture filename
  930. $
  931. """, re.VERBOSE | re.MULTILINE)
  932. uuencode_end_re = re.compile(r"^end\s*\n", re.MULTILINE)
  933. class UUencodeStripper(Stripper):
  934. def __init__(self):
  935. Stripper.__init__(self, uuencode_begin_re.search,
  936. uuencode_end_re.search)
  937. def tokenize(self, m):
  938. mode, fname = m.groups()
  939. return (['uuencode mode:%s' % mode] +
  940. ['uuencode:%s' % x for x in crack_filename(fname)])
  941. crack_uuencode = UUencodeStripper().analyze
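# Illustrative sketch (not part of the original module): stripping a made-up
# uuencoded section.  The tokens record the mode and the cracked filename, and
# the returned text no longer contains the uuencoded block.
def _crack_uuencode_sketch():
    text = "see attachment\nbegin 644 money.txt\n(uuencoded lines here)\nend\nbye\n"
    stripped, tokens = crack_uuencode(text)
    # tokens include 'uuencode mode:644' and 'uuencode:fname:money.txt'
    return stripped, tokens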
  942. # Strip and specially tokenize embedded URLish thingies.
  943. url_fancy_re = re.compile(r"""
  944. \b # the preceding character must not be alphanumeric
  945. (?:
  946. (?:
  947. (https? | ftp) # capture the protocol
  948. :// # skip the boilerplate
  949. )|
  950. (?= ftp\.[^\.\s<>"'\x7f-\xff] )| # allow the protocol to be missing, but only if
  951. (?= www\.[^\.\s<>"'\x7f-\xff] ) # the rest of the url starts "www.x" or "ftp.x"
  952. )
  953. # Do a reasonable attempt at detecting the end. It may or may not
  954. # be in HTML, may or may not be in quotes, etc. If it's full of %
  955. # escapes, cool -- that's a clue too.
  956. ([^\s<>"'\x7f-\xff]+) # capture the guts
  957. """, re.VERBOSE) # '
  958. url_re = re.compile(r"""
  959. (https? | ftp) # capture the protocol
  960. :// # skip the boilerplate
  961. # Do a reasonable attempt at detecting the end. It may or may not
  962. # be in HTML, may or may not be in quotes, etc. If it's full of %
  963. # escapes, cool -- that's a clue too.
  964. ([^\s<>"'\x7f-\xff]+) # capture the guts
  965. """, re.VERBOSE) # '
  966. urlsep_re = re.compile(r"[;?:@&=+,$.]")
  967. class URLStripper(Stripper):
  968. def __init__(self):
  969. # The empty regexp matches at any position, so find_end succeeds at once.
  970. if options["Tokenizer", "x-fancy_url_recognition"]:
  971. search = url_fancy_re.search
  972. else:
  973. search = url_re.search
  974. Stripper.__init__(self, search, re.compile("").search)
  975. def tokenize(self, m):
  976. proto, guts = m.groups()
  977. assert guts
  978. if proto is None:
  979. if guts.lower().startswith("www"):
  980. proto = "http"
  981. elif guts.lower().startswith("ftp"):
  982. proto = "ftp"
  983. else:
  984. proto = "unknown"
  985. tokens = ["proto:" + proto]
  986. pushclue = tokens.append
  987. if options["Tokenizer", "x-pick_apart_urls"]:
  988. url = proto + "://" + guts
  989. escapes = re.findall(r'%..', guts)
  990. # roughly how many %nn escapes are there?
  991. if escapes:
  992. pushclue("url:%%%d" % int(log2(len(escapes))))
  993. # %nn escapes are usually intentional obfuscation. Generate a
  994. # lot of correlated tokens if the URL contains a lot of them.
  995. # The classifier will learn which specific ones are and aren't
  996. # spammy.
  997. tokens.extend(["url:" + escape for escape in escapes])
  998. # now remove any obfuscation and probe around a bit
  999. url = urllib.unquote(url)
  1000. scheme, netloc, path, params, query, frag = urlparse.urlparse(url)
  1001. if options["Tokenizer", "x-lookup_ip"]:
  1002. ips = cache.lookup(netloc)
  1003. if not ips:
  1004. pushclue("url-ip:lookup error")
  1005. else:
  1006. for clue in gen_dotted_quad_clues("url-ip", ips):
  1007. pushclue(clue)
  1008. # one common technique in bogus "please (re-)authorize yourself"
  1009. # scams is to make it appear as if you're visiting a valid
  1010. # payment-oriented site like PayPal, CitiBank or eBay, when you
  1011. # actually aren't. The company's web server appears as the
  1012. # beginning of an often long username element in the URL such as
  1013. # http://www.paypal.com%65%43%99%35@10.0.1.1/iwantyourccinfo
  1014. # generally with an innocuous-looking fragment of text or a
  1015. # valid URL as the highlighted link. Usernames should rarely
  1016. # appear in URLs (perhaps in a local bookmark you established),
  1017. # and never in a URL you receive from an unsolicited email or
  1018. # another website.
  1019. user_pwd, host_port = urllib.splituser(netloc)
  1020. if user_pwd is not None:
  1021. pushclue("url:has user")
  1022. host, port = urllib.splitport(host_port)
  1023. # web servers listening on non-standard ports are suspicious ...
  1024. if port is not None:
  1025. if (scheme == "http" and port != '80' or
  1026. scheme == "https" and port != '443'):
  1027. pushclue("url:non-standard %s port" % scheme)
  1028. # ... as are web servers associated with raw ip addresses
  1029. if re.match("(\d+\.?){4,4}$", host) is not None:
  1030. pushclue("url:ip addr")
  1031. # make sure we later tokenize the unobfuscated url bits
  1032. proto, guts = url.split("://", 1)
  1033. # Lose the trailing punctuation for casual embedding, like:
  1034. # The code is at http://mystuff.org/here? Didn't resolve.
  1035. # or
  1036. # I found it at http://mystuff.org/there/. Thanks!
  1037. while guts and guts[-1] in '.:?!/':
  1038. guts = guts[:-1]
  1039. for piece in guts.split('/'):
  1040. for chunk in urlsep_re.split(piece):
  1041. pushclue("url:" + chunk)
  1042. return tokens
  1043. received_complaints_re = re.compile(r'\([a-z]+(?:\s+[a-z]+)+\)')
  1044. class SlurpingURLStripper(URLStripper):
  1045. def __init__(self):
  1046. URLStripper.__init__(self)
  1047. def analyze(self, text):
  1048. # If there are no URLs, then we need to clear the
  1049. # wordstream, or whatever was there from the last message
  1050. # will be used.
  1051. classifier.slurp_wordstream = None
  1052. # Continue as normal.
  1053. return URLStripper.analyze(self, text)
  1054. def tokenize(self, m):
  1055. # XXX Note that the 'slurped' tokens are *always* trained
  1056. # XXX on; it would be simple to change/parameterize this.
  1057. tokens = URLStripper.tokenize(self, m)
  1058. if not options["URLRetriever", "x-slurp_urls"]:
  1059. return tokens
  1060. proto, guts = m.groups()
  1061. if proto != "http":
  1062. return tokens
  1063. assert guts
  1064. while guts and guts[-1] in '.:;?!/)':
  1065. guts = guts[:-1]
  1066. classifier.slurp_wordstream = (proto, guts)
  1067. return tokens
  1068. if options["URLRetriever", "x-slurp_urls"]:
  1069. crack_urls = SlurpingURLStripper().analyze
  1070. else:
  1071. crack_urls = URLStripper().analyze
  1072. # Nuke HTML <style gimmicks.
  1073. html_style_start_re = re.compile(r"""
  1074. < \s* style\b [^>]* >
  1075. """, re.VERBOSE)
  1076. class StyleStripper(Stripper):
  1077. def __init__(self):
  1078. Stripper.__init__(self, html_style_start_re.search,
  1079. re.compile(r"</style>").search)
  1080. crack_html_style = StyleStripper().analyze
  1081. # Nuke HTML comments.
  1082. class CommentStripper(Stripper):
  1083. def __init__(self):
  1084. Stripper.__init__(self,
  1085. re.compile(r"<!--|<\s*comment\s*[^>]*>").search,
  1086. re.compile(r"-->|</comment>").search)
  1087. crack_html_comment = CommentStripper().analyze
  1088. # Nuke stuff between <noframes> </noframes> tags.
  1089. class NoframesStripper(Stripper):
  1090. def __init__(self):
  1091. Stripper.__init__(self,
  1092. re.compile(r"<\s*noframes\s*>").search,
  1093. re.compile(r"</noframes\s*>").search)
  1094. crack_noframes = NoframesStripper().analyze
  1095. # Scan HTML for constructs often seen in viruses and worms.
  1096. # <script </script
  1097. # <iframe </iframe
  1098. # src=cid:
  1099. # height=0 width=0
  1100. virus_re = re.compile(r"""
  1101. < /? \s* (?: script | iframe) \b
  1102. | \b src= ['"]? cid:
  1103. | \b (?: height | width) = ['"]? 0
  1104. """, re.VERBOSE) # '
  1105. def find_html_virus_clues(text):
  1106. for bingo in virus_re.findall(text):
  1107. yield bingo
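# Illustrative sketch (not part of the original module): clue strings pulled
# out of a made-up worm-style HTML snippet.
def _virus_clue_sketch():
    html = '<iframe src=cid:ABC123 height=0 width=0></iframe>'
    # yields clues such as '<iframe', 'src=cid:', 'height=0' and 'width=0'
    return list(find_html_virus_clues(html))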
  1108. numeric_entity_re = re.compile(r'&#(\d+);')
  1109. def numeric_entity_replacer(m):
  1110. try:
  1111. return chr(int(m.group(1)))
  1112. except:
  1113. return '?'
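# Illustrative sketch (not part of the original module): decoding numeric
# character references the way spammers use them to disguise words.
def _numeric_entity_sketch():
    assert numeric_entity_re.sub(numeric_entity_replacer, "V&#105;agra") == "Viagra"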
  1114. breaking_entity_re = re.compile(r"""
  1115. &nbsp;
  1116. | < (?: p
  1117. | br
  1118. )
  1119. >
  1120. """, re.VERBOSE)
  1121. class Tokenizer:
  1122. date_hms_re = re.compile(r' (?P<hour>[0-9][0-9])'
  1123. r':(?P<minute>[0-9][0-9])'
  1124. r'(?::[0-9][0-9])? ')
  1125. date_formats = ("%a, %d %b %Y %H:%M:%S (%Z)",
  1126. "%a, %d %b %Y %H:%M:%S %Z",
  1127. "%d %b %Y %H:%M:%S (%Z)",
  1128. "%d %b %Y %H:%M:%S %Z",
  1129. "%a, %d %b %Y %H:%M (%Z)",
  1130. "%a, %d %b %Y %H:%M %Z",
  1131. "%d %b %Y %H:%M (%Z)",
  1132. "%d %b %Y %H:%M %Z")
  1133. def __init__(self):
  1134. self.setup()
  1135. def setup(self):
  1136. """Get the tokenizer ready to use; this should be called after
  1137. all options have been set."""
  1138. # We put this here, rather than in __init__, so that this can be
  1139. # done after we set options at runtime (since the tokenizer
  1140. # instance is generally created when this module is imported).
  1141. if options["Tokenizer", "basic_header_tokenize"]:
  1142. self.basic_skip = [re.compile(s)
  1143. for s in options["Tokenizer",
  1144. "basic_header_skip"]]
  1145. def get_message(self, obj):
  1146. return get_message(obj)
  1147. def tokenize(self, obj):
  1148. msg = self.get_message(obj)
  1149. for tok in self.tokenize_headers(msg):
  1150. yield tok
  1151. for tok in self.tokenize_body(msg):
  1152. yield tok
  1153. def tokenize_headers(self, msg):
  1154. # Special tagging of header lines and MIME metadata.
  1155. # Content-{Type, Disposition} and their params, and charsets.
  1156. # This is done for all MIME sections.
  1157. for x in msg.walk():
  1158. for w in crack_content_xyz(x):
  1159. yield w
  1160. # The rest is solely tokenization of header lines.
  1161. # XXX The headers in my (Tim's) spam and ham corpora are so different
  1162. # XXX (they came from different sources) that including several kinds
  1163. # XXX of header analysis renders the classifier's job trivial. So
  1164. # XXX lots of this is crippled now, controlled by an ever-growing
  1165. # XXX collection of funky options.
  1166. # Basic header tokenization
  1167. # Tokenize the contents of each header field in the way Subject lines
  1168. # are tokenized later.
  1169. # XXX Different kinds of tokenization have gotten better results on
  1170. # XXX different header lines. No experiments have been run on
  1171. # XXX whether the best choice is being made for each of the header
  1172. # XXX lines tokenized by this section.
  1173. # The name of the header is used as a tag. Tokens look like
  1174. # "header:word". The basic approach is simple and effective, but
  1175. # also very sensitive to biases in the ham and spam collections.
  1176. # For example, if the ham and spam were collected at different
  1177. # times, several headers with date/time information will become
  1178. # the best discriminators.
  1179. # (Not just Date, but Received and X-From_.)
  1180. if options["Tokenizer", "basic_header_tokenize"]:
  1181. for k, v in msg.items():
  1182. k = k.lower()
  1183. for rx in self.basic_skip:
  1184. if rx.match(k):
  1185. break # do nothing -- we're supposed to skip this
  1186. else:
  1187. # Never found a match -- don't skip this.
  1188. for w in subject_word_re.findall(v):
  1189. for t in tokenize_word(w):
  1190. yield "%s:%s" % (k, t)
  1191. if options["Tokenizer", "basic_header_tokenize_only"]:
  1192. return
  1193. # Habeas Headers - see http://www.habeas.com
  1194. if options["Tokenizer", "x-search_for_habeas_headers"]:
  1195. habeas_headers = [
  1196. ("X-Habeas-SWE-1", "winter into spring"),
  1197. ("X-Habeas-SWE-2", "brightly anticipated"),
  1198. ("X-Habeas-SWE-3", "like Habeas SWE (tm)"),
  1199. ("X-Habeas-SWE-4", "Copyright 2002 Habeas (tm)"),
  1200. ("X-Habeas-SWE-5", "Sender Warranted Email (SWE) (tm). The sender of this"),
  1201. ("X-Habeas-SWE-6", "email in exchange for a license for this Habeas"),
  1202. ("X-Habeas-SWE-7", "warrant mark warrants that this is a Habeas Compliant"),
  1203. ("X-Habeas-SWE-8", "Message (HCM) and not spam. Please report use of this"),
  1204. ("X-Habeas-SWE-9", "mark in spam to <http://www.habeas.com/report/>.")
  1205. ]
  1206. valid_habeas = 0
  1207. invalid_habeas = False
  1208. for opt, val in habeas_headers:
  1209. habeas = msg.get(opt)
  1210. if habeas is not None:
  1211. if options["Tokenizer", "x-reduce_habeas_headers"]:
  1212. if habeas == val:
  1213. valid_habeas += 1
  1214. else:
  1215. invalid_habeas = True
  1216. else:
  1217. if habeas == val:
  1218. yield opt.lower() + ":valid"
  1219. else:
  1220. yield opt.lower() + ":invalid"
  1221. if options["Tokenizer", "x-reduce_habeas_headers"]:
  1222. # If there was any invalid line, we record as invalid.
  1223. # If all nine lines were corr

(Listing truncated here; the full file is 1736 lines.)