PageRenderTime 61ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 1ms

/unladen_swallow/lib/spambayes/scripts/sb_imapfilter.py

https://bitbucket.org/csenger/benchmarks
Python | 1324 lines | 1257 code | 11 blank | 56 comment | 15 complexity | c0543bea710b8f588d097fea5cde8098 MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0, GPL-2.0

Large files are truncated, but you can click here to view the full file

  1. #!/usr/bin/env python
  2. """An IMAP filter. An IMAP message box is scanned and all non-scored
  3. messages are scored and (where necessary) filtered.
  4. Usage:
  5. sb_imapfilter [options]
  6. note: option values with spaces in them must be enclosed
  7. in double quotes
  8. options:
  9. -p dbname : pickled training database filename
  10. -d dbname : dbm training database filename
  11. -t : train contents of spam folder and ham folder
  12. -c : classify inbox
  13. -h : display this message
  14. -v : verbose mode
  15. -P : security option to prompt for imap password,
  16. rather than look in options["imap", "password"]
  17. -e y/n : expunge/purge messages on exit (y) or not (n)
  18. -i debuglvl : a somewhat mysterious imaplib debugging level
  19. (4 is a good level, and suitable for bug reports)
  20. -l minutes : period of time between filtering operations
  21. -b : Launch a web browser showing the user interface.
  22. -o section:option:value :
  23. set [section, option] in the options database
  24. to value
  25. Examples:
  26. Classify inbox, with dbm database
  27. sb_imapfilter -c -d bayes.db
  28. Train Spam and Ham, then classify inbox, with dbm database
  29. sb_imapfilter -t -c -d bayes.db
  30. Train Spam and Ham only, with pickled database
  31. sb_imapfilter -t -p bayes.db
  32. Warnings:
  33. o We never delete mail, unless you use the -e/purge option, but we do
  34. mark a lot as deleted, and your mail client might remove that for
  35. you. We try to only mark as deleted once the moved/altered message
  36. is correctly saved, but things might go wrong. We *strongly*
  37. recommend that you try this script out on mail that you can recover
  38. from somewhere else, at least at first.
  39. """
  40. from __future__ import generators
  41. todo = """
  42. o IMAP supports authentication via other methods than the plain-text
  43. password method that we are using at the moment. Neither of the
  44. servers I have access to offer any alternative method, however. If
  45. someone's does, then it would be nice to offer this.
  46. Thanks to #1169939 we now support CRAM_MD5 if available. It'd still
  47. be good to support others, though.
  48. o Usernames should be able to be literals as well as quoted strings.
  49. This might help if the username/password has special characters like
  50. accented characters.
  51. o Suggestions?
  52. """
  53. # This module is part of the SpamBayes project, which is Copyright 2002-2007
  54. # The Python Software Foundation and is covered by the Python Software
  55. # Foundation license.
  56. __author__ = "Tony Meyer <ta-meyer@ihug.co.nz>, Tim Stone"
  57. __credits__ = "All the SpamBayes folk. The original filter design owed " \
  58. "much to isbg by Roger Binns (http://www.rogerbinns.com/isbg)."
  59. # If we are running as a frozen application, then chances are that
  60. # output is just lost. We'd rather log this, like sb_server and Outlook
  61. # log, so that the user can pull up the output if possible. We could just
  62. # rely on the user piping the output appropriately, but would rather have
  63. # more control. The sb_server tray application only does this if not
  64. # running in a console window, but we do it whenever we are frozen.
  65. import os
  66. import sys
  67. if hasattr(sys, "frozen"):
  68. # We want to move to logging module later, so for now, we
  69. # hack together a simple logging strategy.
  70. try:
  71. import win32api
  72. except ImportError:
  73. if sys.platform == "win32":
  74. # Fall back to CWD, but warn user.
  75. status = "Warning: your log is stored in the current " \
  76. "working directory. We recommend installing " \
  77. "the pywin32 extensions, so that the log is " \
  78. "stored in the Windows temp directory."
  79. temp_dir = os.getcwd()
  80. else:
  81. # Try for a /tmp directory.
  82. if os.path.isdir("/tmp"):
  83. temp_dir = "/tmp"
  84. status = "Log file opened in /tmp"
  85. else:
  86. status = "Warning: your log is stored in the current " \
  87. "working directory. If this does not suit you " \
  88. "please let the spambayes@python.org crowd know " \
  89. "so that an alternative can be arranged."
  90. else:
  91. temp_dir = win32api.GetTempPath()
  92. status = "Log file opened in " + temp_dir
  93. for i in range(3, 0, -1):
  94. try:
  95. os.unlink(os.path.join(temp_dir, "SpamBayesIMAP%d.log" % (i+1)))
  96. except os.error:
  97. pass
  98. try:
  99. os.rename(
  100. os.path.join(temp_dir, "SpamBayesIMAP%d.log" % i),
  101. os.path.join(temp_dir, "SpamBayesIMAP%d.log" % (i+1))
  102. )
  103. except os.error:
  104. pass
  105. # Open this log, as unbuffered, so crashes still get written.
  106. sys.stdout = open(os.path.join(temp_dir,"SpamBayesIMAP1.log"), "wt", 0)
  107. sys.stderr = sys.stdout
  108. import socket
  109. import re
  110. import time
  111. import getopt
  112. import types
  113. import thread
  114. import email
  115. import email.Parser
  116. from getpass import getpass
  117. from email.Utils import parsedate
  118. from spambayes import Stats
  119. from spambayes import message
  120. from spambayes.Options import options, optionsPathname
  121. from spambayes import storage, Dibbler
  122. from spambayes.UserInterface import UserInterfaceServer
  123. from spambayes.ImapUI import IMAPUserInterface, LoginFailure
  124. from spambayes.Version import get_current_version
  125. from imaplib import IMAP4
  126. from imaplib import Time2Internaldate
  127. try:
  128. if options["imap", "use_ssl"]:
  129. from imaplib import IMAP4_SSL as BaseIMAP
  130. else:
  131. from imaplib import IMAP4 as BaseIMAP
  132. except ImportError:
  133. from imaplib import IMAP4 as BaseIMAP
  134. class BadIMAPResponseError(Exception):
  135. """An IMAP command returned a non-"OK" response."""
  136. def __init__(self, command, response):
  137. self.command = command
  138. self.response = response
  139. def __str__(self):
  140. return "The command '%s' failed to give an OK response.\n%s" % \
  141. (self.command, self.response)
  142. class IMAPSession(BaseIMAP):
  143. '''A class extending the IMAP4 class, with a few optimizations'''
  144. timeout = 60 # seconds
  145. def __init__(self, server, debug=0, do_expunge = options["imap", "expunge"] ):
  146. if ":" in server:
  147. server, port = server.split(':', 1)
  148. port = int(port)
  149. else:
  150. if options["imap", "use_ssl"]:
  151. port = 993
  152. else:
  153. port = 143
  154. # There's a tricky situation where if use_ssl is False, but we
  155. # try to connect to a IMAP over SSL server, we will just hang
  156. # forever, waiting for a response that will never come. To
  157. # get past this, just for the welcome message, we install a
  158. # timeout on the connection. Normal service is then returned.
  159. # This only applies when we are not using SSL.
  160. if not hasattr(self, "ssl"):
  161. readline = self.readline
  162. self.readline = self.readline_timeout
  163. try:
  164. BaseIMAP.__init__(self, server, port)
  165. except (BaseIMAP.error, socket.gaierror, socket.error):
  166. if options["globals", "verbose"]:
  167. print >> sys.stderr, "Cannot connect to server", server, "on port", port
  168. if not hasattr(self, "ssl"):
  169. print >> sys.stderr, ("If you are connecting to an SSL server,"
  170. "please ensure that you\n"
  171. "have the 'Use SSL' option enabled.")
  172. self.connected = False
  173. else:
  174. self.connected = True
  175. if not hasattr(self, "ssl"):
  176. self.readline = readline
  177. self.debug = debug
  178. self.do_expunge = do_expunge
  179. self.server = server
  180. self.port = port
  181. self.logged_in = False
  182. # For efficiency, we remember which folder we are currently
  183. # in, and only send a select command to the IMAP server if
  184. # we want to *change* folders. This functionality is used by
  185. # both IMAPMessage and IMAPFolder.
  186. self.current_folder = None
  187. # We override the base read so that we only read a certain amount
  188. # of data at a time. OS X and Python has problems with getting
  189. # large amounts of memory at a time, so maybe this will be a way we
  190. # can work around that (I don't know, and don't have a mac to test,
  191. # but we need to try something).
  192. self._read = self.read
  193. self.read = self.safe_read
  194. def readline_timeout(self):
  195. """Read line from remote, possibly timing out."""
  196. st_time = time.time()
  197. self.sock.setblocking(False)
  198. buffer = []
  199. while True:
  200. if (time.time() - st_time) > self.timeout:
  201. if options["globals", "verbose"]:
  202. print >> sys.stderr, "IMAP Timing out"
  203. break
  204. try:
  205. data = self.sock.recv(1)
  206. except socket.error, e:
  207. if e[0] == 10035:
  208. # Nothing to receive, keep going.
  209. continue
  210. raise
  211. if not data:
  212. break
  213. if data == '\n':
  214. break
  215. buffer.append(data)
  216. self.sock.setblocking(True)
  217. return "".join(buffer)
  218. def login(self, username, pwd):
  219. """Log in to the IMAP server, catching invalid username/password."""
  220. assert self.connected, "Must be connected before logging in."
  221. if 'AUTH=CRAM-MD5' in self.capabilities:
  222. login_func = self.login_cram_md5
  223. args = (username, pwd)
  224. description = "MD5"
  225. else:
  226. login_func = BaseIMAP.login # superclass login
  227. args = (self, username, pwd)
  228. description = "plain-text"
  229. try:
  230. login_func(*args)
  231. except BaseIMAP.error, e:
  232. msg = "The username (%s) and/or password (sent in %s) may " \
  233. "be incorrect." % (username, description)
  234. raise LoginFailure(msg)
  235. self.logged_in = True
  236. def logout(self):
  237. """Log off from the IMAP server, possibly expunging.
  238. Note that most, if not all, of the expunging is probably done in
  239. SelectFolder, rather than here, for purposes of speed."""
  240. # We may never have logged in, in which case we do nothing.
  241. if self.connected and self.logged_in and self.do_expunge:
  242. # Expunge messages from the ham, spam and unsure folders.
  243. for fol in ["spam_folder",
  244. "unsure_folder",
  245. "ham_folder"]:
  246. folder_name = options["imap", fol]
  247. if folder_name:
  248. self.select(folder_name)
  249. self.expunge()
  250. # Expunge messages from the ham and spam training folders.
  251. for fol_list in ["ham_train_folders",
  252. "spam_train_folders",]:
  253. for fol in options["imap", fol_list]:
  254. self.select(fol)
  255. self.expunge()
  256. BaseIMAP.logout(self) # superclass logout
  257. def check_response(self, command, IMAP_response):
  258. """A utility function to check the response from IMAP commands.
  259. Raises BadIMAPResponseError if the response is not OK. Returns
  260. the data segment of the response otherwise."""
  261. response, data = IMAP_response
  262. if response != "OK":
  263. raise BadIMAPResponseError(command, IMAP_response)
  264. return data
  265. def SelectFolder(self, folder):
  266. """A method to point ensuing IMAP operations at a target folder.
  267. This is essentially a wrapper around the IMAP select command, which
  268. ignores the command if the folder is already selected."""
  269. if self.current_folder != folder:
  270. if self.current_folder != None and self.do_expunge:
  271. # It is faster to do close() than a single
  272. # expunge when we log out (because expunge returns
  273. # a list of all the deleted messages which we don't do
  274. # anything with).
  275. self.close()
  276. self.current_folder = None
  277. if folder == "":
  278. # This is Python bug #845560 - if the empty string is
  279. # passed, we get a traceback, not just an 'invalid folder'
  280. # error, so raise our own error.
  281. raise BadIMAPResponseError("select",
  282. "Cannot have empty string as "
  283. "folder name in select")
  284. # We *always* use SELECT and not EXAMINE, because this
  285. # speeds things up considerably.
  286. response = self.select(folder, None)
  287. data = self.check_response("select %s" % (folder,), response)
  288. self.current_folder = folder
  289. return data
  290. number_re = re.compile(r"{\d+}")
  291. folder_re = re.compile(r"\(([\w\\ ]*)\) ")
  292. def folder_list(self):
  293. """Return a alphabetical list of all folders available on the
  294. server."""
  295. response = self.list()
  296. try:
  297. all_folders = self.check_response("list", response)
  298. except BadIMAPResponseError:
  299. # We want to keep going, so just print out a warning, and
  300. # return an empty list.
  301. if options["globals", "verbose"]:
  302. print >> sys.stderr, "Could not retrieve folder list."
  303. return []
  304. folders = []
  305. for fol in all_folders:
  306. # Sigh. Some servers may give us back the folder name as a
  307. # literal, so we need to crunch this out.
  308. if isinstance(fol, types.TupleType):
  309. m = self.number_re.search(fol[0])
  310. if not m:
  311. # Something is wrong here! Skip this folder.
  312. continue
  313. fol = '%s"%s"' % (fol[0][:m.start()], fol[1])
  314. m = self.folder_re.search(fol)
  315. if not m:
  316. # Something is not good with this folder, so skip it.
  317. continue
  318. name_attributes = fol[:m.end()-1]
  319. # IMAP is a truly odd protocol. The delimiter is
  320. # only the delimiter for this particular folder - each
  321. # folder *may* have a different delimiter
  322. self.folder_delimiter = fol[m.end()+1:m.end()+2]
  323. # A bit of a hack, but we really need to know if this is
  324. # the case.
  325. if self.folder_delimiter == ',':
  326. print >> sys.stderr, ("WARNING: Your imap server uses a comma as the "
  327. "folder delimiter. This may cause unpredictable " \
  328. "errors.")
  329. folders.append(fol[m.end()+4:].strip('"'))
  330. folders.sort()
  331. return folders
  332. # A flag can have any character in the ascii range 32-126 except for
  333. # (){ %*"\
  334. FLAG_CHARS = ""
  335. for i in range(32, 127):
  336. if not chr(i) in ['(', ')', '{', ' ', '%', '*', '"', '\\']:
  337. FLAG_CHARS += chr(i)
  338. FLAG = r"\\?[" + re.escape(FLAG_CHARS) + r"]+"
  339. # The empty flag set "()" doesn't match, so that extract_fetch_data()
  340. # returns data["FLAGS"] == None
  341. FLAGS_RE = re.compile(r"(FLAGS) (\((" + FLAG + r" )*(" + FLAG + r")\))")
  342. INTERNALDATE_RE = re.compile(r"(INTERNALDATE) (\"\d{1,2}\-[A-Za-z]{3,3}\-" +
  343. r"\d{2,4} \d{2,2}\:\d{2,2}\:\d{2,2} " +
  344. r"[\+\-]\d{4,4}\")")
  345. RFC822_RE = re.compile(r"(RFC822) (\{[\d]+\})")
  346. BODY_PEEK_RE = re.compile(r"(BODY\[\]) (\{[\d]+\})")
  347. RFC822_HEADER_RE = re.compile(r"(RFC822.HEADER) (\{[\d]+\})")
  348. UID_RE = re.compile(r"(UID) ([\d]+)")
  349. UID_RE2 = re.compile(r" *(UID) ([\d]+)\)")
  350. FETCH_RESPONSE_RE = re.compile(r"([0-9]+) \(([" + \
  351. re.escape(FLAG_CHARS) + r"\"\{\}\(\)\\ ]*)\)?")
  352. LITERAL_RE = re.compile(r"^\{[\d]+\}$")
  353. def _extract_fetch_data(self, response):
  354. """This does the real work of extracting the data, for each message
  355. number.
  356. """
  357. # We support the following FETCH items:
  358. # FLAGS
  359. # INTERNALDATE
  360. # RFC822
  361. # UID
  362. # RFC822.HEADER
  363. # BODY.PEEK
  364. # All others are ignored.
  365. if isinstance(response, types.StringTypes):
  366. response = (response,)
  367. data = {}
  368. expected_literal = None
  369. if self.UID_RE2.match(response[-1]):
  370. response = response[:-1]
  371. for part in response:
  372. # We ignore parentheses by themselves, for convenience.
  373. if part == ')':
  374. continue
  375. if expected_literal:
  376. # This should be a literal of a certain size.
  377. key, expected_size = expected_literal
  378. ## if len(part) != expected_size:
  379. ## raise BadIMAPResponseError(\
  380. ## "FETCH response (wrong size literal %d != %d)" % \
  381. ## (len(part), expected_size), response)
  382. data[key] = part
  383. expected_literal = None
  384. continue
  385. # The first item will always be the message number.
  386. mo = self.FETCH_RESPONSE_RE.match(part)
  387. if mo:
  388. data["message_number"] = mo.group(1)
  389. rest = mo.group(2)
  390. else:
  391. raise BadIMAPResponseError("FETCH response", response)
  392. for r in [self.FLAGS_RE, self.INTERNALDATE_RE, self.RFC822_RE,
  393. self.UID_RE, self.RFC822_HEADER_RE, self.BODY_PEEK_RE]:
  394. mo = r.search(rest)
  395. if mo is not None:
  396. if self.LITERAL_RE.match(mo.group(2)):
  397. # The next element will be a literal.
  398. expected_literal = (mo.group(1),
  399. int(mo.group(2)[1:-1]))
  400. else:
  401. data[mo.group(1)] = mo.group(2)
  402. return data
  403. def extract_fetch_data(self, response):
  404. """Extract data from the response given to an IMAP FETCH command.
  405. The data is put into a dictionary, which is returned, where the
  406. keys are the fetch items.
  407. """
  408. # There may be more than one message number in the response, so
  409. # handle separately.
  410. if isinstance(response, types.StringTypes):
  411. response = (response,)
  412. data = {}
  413. for msg in response:
  414. msg_data = self._extract_fetch_data(msg)
  415. if msg_data:
  416. # Maybe there are two about the same message number!
  417. num = msg_data["message_number"]
  418. if num in data:
  419. data[num].update(msg_data)
  420. else:
  421. data[num] = msg_data
  422. return data
  423. # Maximum amount of data that will be read at any one time.
  424. MAXIMUM_SAFE_READ = 4096
  425. def safe_read(self, size):
  426. """Read data from remote, but in manageable sizes."""
  427. data = []
  428. while size > 0:
  429. if size < self.MAXIMUM_SAFE_READ:
  430. to_collect = size
  431. else:
  432. to_collect = self.MAXIMUM_SAFE_READ
  433. data.append(self._read(to_collect))
  434. size -= self.MAXIMUM_SAFE_READ
  435. return "".join(data)
  436. class IMAPMessage(message.SBHeaderMessage):
  437. def __init__(self):
  438. message.SBHeaderMessage.__init__(self)
  439. self.folder = None
  440. self.previous_folder = None
  441. self.rfc822_command = "(BODY.PEEK[])"
  442. self.rfc822_key = "BODY[]"
  443. self.got_substance = False
  444. self.invalid = False
  445. self.could_not_retrieve = False
  446. self.imap_server = None
  447. def extractTime(self):
  448. """When we create a new copy of a message, we need to specify
  449. a timestamp for the message, if we can't get the information
  450. from the IMAP server itself. If the message has a valid date
  451. header we use that. Otherwise, we use the current time."""
  452. message_date = self["Date"]
  453. if message_date is not None:
  454. parsed_date = parsedate(message_date)
  455. if parsed_date is not None:
  456. try:
  457. return Time2Internaldate(time.mktime(parsed_date))
  458. except ValueError:
  459. # Invalid dates can cause mktime() to raise a
  460. # ValueError, for example:
  461. # >>> time.mktime(parsedate("Mon, 06 May 0102 10:51:16 -0100"))
  462. # Traceback (most recent call last):
  463. # File "<interactive input>", line 1, in ?
  464. # ValueError: year out of range
  465. # (Why this person is getting mail from almost two
  466. # thousand years ago is another question <wink>).
  467. # In any case, we just pass and use the current date.
  468. pass
  469. except OverflowError:
  470. pass
  471. return Time2Internaldate(time.time())
  472. def get_full_message(self):
  473. """Retrieve the RFC822 message from the IMAP server and return a
  474. new IMAPMessage object that has the same details as this message,
  475. but also has the substance."""
  476. if self.got_substance:
  477. return self
  478. assert self.id, "Cannot get substance of message without an id"
  479. assert self.uid, "Cannot get substance of message without an UID"
  480. assert self.imap_server, "Cannot do anything without IMAP connection"
  481. # First, try to select the folder that the message is in.
  482. try:
  483. self.imap_server.SelectFolder(self.folder.name)
  484. except BadIMAPResponseError:
  485. # Can't select the folder, so getting the substance will not
  486. # work.
  487. self.could_not_retrieve = True
  488. print >> sys.stderr, "Could not select folder %s for message " \
  489. "%s (uid %s)" % (self.folder.name, self.id, self.uid)
  490. return self
  491. # Now try to fetch the substance of the message.
  492. try:
  493. response = self.imap_server.uid("FETCH", self.uid,
  494. self.rfc822_command)
  495. except MemoryError:
  496. # Really big messages can trigger a MemoryError here.
  497. # The problem seems to be line 311 (Python 2.3) of socket.py,
  498. # which has "return "".join(buffers)". This has also caused
  499. # problems with Mac OS X 10.3, which apparently is very stingy
  500. # with memory (the malloc calls fail!). The problem then is
  501. # line 301 of socket.py which does
  502. # "data = self._sock.recv(recv_size)".
  503. # We want to handle this gracefully, although we can't really
  504. # do what we do later, and rewrite the message, since we can't
  505. # load it in the first place. Maybe an elegant solution would
  506. # be to get the message in parts, or just use the first X
  507. # characters for classification. For now, we just carry on,
  508. # warning the user and ignoring the message.
  509. self.could_not_retrieve = True
  510. print >> sys.stderr, "MemoryError with message %s (uid %s)" % \
  511. (self.id, self.uid)
  512. return self
  513. command = "uid fetch %s" % (self.uid,)
  514. response_data = self.imap_server.check_response(command, response)
  515. data = self.imap_server.extract_fetch_data(response_data)
  516. # The data will be a dictionary - hopefully with only one element,
  517. # but maybe more than one. The key is the message number, which we
  518. # do not have (we use the UID instead). So we look through the
  519. # message and use the first data of the right type we find.
  520. rfc822_data = None
  521. for msg_data in data.itervalues():
  522. if self.rfc822_key in msg_data:
  523. rfc822_data = msg_data[self.rfc822_key]
  524. break
  525. if rfc822_data is None:
  526. raise BadIMAPResponseError("FETCH response", response_data)
  527. try:
  528. new_msg = email.message_from_string(rfc822_data, IMAPMessage)
  529. # We use a general 'except' because the email package doesn't
  530. # always return email.Errors (it can return a TypeError, for
  531. # example) if the email is invalid. In any case, we want
  532. # to keep going, and not crash, because we might leave the
  533. # user's mailbox in a bad state if we do. Better to soldier on.
  534. except:
  535. # Yikes! Barry set this to return at this point, which
  536. # would work ok for training (IIRC, that's all he's
  537. # using it for), but for filtering, what happens is that
  538. # the message ends up blank, but ok, so the original is
  539. # flagged to be deleted, and a new (almost certainly
  540. # unsure) message, *with only the spambayes headers* is
  541. # created. The nice solution is still to do what sb_server
  542. # does and have a X-Spambayes-Exception header with the
  543. # exception data and then the original message.
  544. self.invalid = True
  545. text, details = message.insert_exception_header(
  546. rfc822_data, self.id)
  547. self.invalid_content = text
  548. self.got_substance = True
  549. # Print the exception and a traceback.
  550. print >> sys.stderr, details
  551. return self
  552. new_msg.folder = self.folder
  553. new_msg.previous_folder = self.previous_folder
  554. new_msg.rfc822_command = self.rfc822_command
  555. new_msg.rfc822_key = self.rfc822_key
  556. new_msg.imap_server = self.imap_server
  557. new_msg.uid = self.uid
  558. new_msg.setId(self.id)
  559. new_msg.got_substance = True
  560. if not new_msg.has_key(options["Headers", "mailid_header_name"]):
  561. new_msg[options["Headers", "mailid_header_name"]] = self.id
  562. if options["globals", "verbose"]:
  563. sys.stdout.write(chr(8) + "*")
  564. return new_msg
  565. def MoveTo(self, dest):
  566. '''Note that message should move to another folder. No move is
  567. carried out until Save() is called, for efficiency.'''
  568. if self.previous_folder is None:
  569. self.previous_folder = self.folder
  570. self.folder = dest
  571. def as_string(self, unixfrom=False):
  572. # Basically the same as the parent class's except that we handle
  573. # the case where the data was unparsable, so we haven't done any
  574. # filtering, and we are not actually a proper email.Message object.
  575. # We also don't mangle the from line; the server must take care of
  576. # this.
  577. if self.invalid:
  578. return self._force_CRLF(self.invalid_content)
  579. else:
  580. return message.SBHeaderMessage.as_string(self, unixfrom,
  581. mangle_from_=False)
  582. recent_re = re.compile(r"\\Recent ?| ?\\Recent")
  583. def Save(self):
  584. """Save message to IMAP server.
  585. We can't actually update the message with IMAP, so what we do is
  586. create a new message and delete the old one."""
  587. assert self.folder is not None, \
  588. "Can't save a message that doesn't have a folder."
  589. assert self.id, "Can't save a message that doesn't have an id."
  590. assert self.imap_server, "Can't do anything without IMAP connection."
  591. response = self.imap_server.uid("FETCH", self.uid,
  592. "(FLAGS INTERNALDATE)")
  593. command = "fetch %s (flags internaldate)" % (self.uid,)
  594. response_data = self.imap_server.check_response(command, response)
  595. data = self.imap_server.extract_fetch_data(response_data)
  596. # The data will be a dictionary - hopefully with only one element,
  597. # but maybe more than one. The key is the message number, which we
  598. # do not have (we use the UID instead). So we look through the
  599. # message and use the last data of the right type we find.
  600. msg_time = self.extractTime()
  601. flags = None
  602. for msg_data in data.itervalues():
  603. if "INTERNALDATE" in msg_data:
  604. msg_time = msg_data["INTERNALDATE"]
  605. if "FLAGS" in msg_data:
  606. flags = msg_data["FLAGS"]
  607. # The \Recent flag can be fetched, but cannot be stored
  608. # We must remove it from the list if it is there.
  609. flags = self.recent_re.sub("", flags)
  610. # We try to save with flags and time, then with just the
  611. # time, then with the flags and the current time, then with just
  612. # the current time. The first should work, but the first three
  613. # sometimes (due to the quirky IMAP server) fail.
  614. for flgs, tme in [(flags, msg_time),
  615. (None, msg_time),
  616. (flags, Time2Internaldate(time.time())),
  617. (None, Time2Internaldate(time.time()))]:
  618. try:
  619. response = self.imap_server.append(self.folder.name, flgs, tme,
  620. self.as_string())
  621. except BaseIMAP.error:
  622. continue
  623. try:
  624. self.imap_server.check_response("", response)
  625. except BadIMAPResponseError:
  626. pass
  627. else:
  628. break
  629. else:
  630. command = "append %s %s %s %s" % (self.folder.name, flgs, tme,
  631. self.as_string)
  632. raise BadIMAPResponseError(command)
  633. if self.previous_folder is None:
  634. self.imap_server.SelectFolder(self.folder.name)
  635. else:
  636. self.imap_server.SelectFolder(self.previous_folder.name)
  637. self.previous_folder = None
  638. response = self.imap_server.uid("STORE", self.uid, "+FLAGS.SILENT",
  639. "(\\Deleted \\Seen)")
  640. command = "set %s to be deleted and seen" % (self.uid,)
  641. self.imap_server.check_response(command, response)
  642. # Not all IMAP servers immediately offer the new message, but
  643. # we need to find it to get the new UID. We need to wait until
  644. # the server offers up an EXISTS command, so we no-op until that
  645. # is the case.
  646. # See [ 941596 ] sb_imapfilter.py not adding headers / moving messages
  647. # We use the recent() function, which no-ops if necessary. We try
  648. # 100 times, and then give up. If a message arrives independantly,
  649. # and we are told about it before our message, then this could
  650. # cause trouble, but that would be one weird server.
  651. for i in xrange(100):
  652. response = self.imap_server.recent()
  653. data = self.imap_server.check_response("recent", response)
  654. if data[0] is not None:
  655. if options["globals", "verbose"]:
  656. print >> sys.stderr, "[imapfilter] found saved message", self.uid,
  657. print >> sys.stderr, "in iteration", i
  658. break
  659. else:
  660. if options["globals", "verbose"]:
  661. print >> sys.stderr, ("[imapfilter] can't find saved message after"
  662. "100 iterations:"), self.uid
  663. # raise BadIMAPResponseError("recent", "Cannot find saved message")
  664. # We need to update the UID, as it will have changed.
  665. # Although we don't use the UID to keep track of messages, we do
  666. # have to use it for IMAP operations.
  667. self.imap_server.SelectFolder(self.folder.name)
  668. search_string = "(UNDELETED HEADER %s \"%s\")" % \
  669. (options["Headers", "mailid_header_name"],
  670. self.id.replace('\\',r'\\').replace('"',r'\"'))
  671. response = self.imap_server.uid("SEARCH", search_string)
  672. data = self.imap_server.check_response("search " + search_string,
  673. response)
  674. new_id = data[0]
  675. # See [ 870799 ] imap trying to fetch invalid message UID
  676. # It seems that although the save gave a "NO" response to the
  677. # first save, the message was still saved (without the flags,
  678. # probably). This really isn't good behaviour on the server's
  679. # part, but, as usual, we try and deal with it. So, if we get
  680. # more than one undeleted message with the same SpamBayes id,
  681. # delete all of them apart from the last one, and use that.
  682. multiple_ids = new_id.split()
  683. for id_to_remove in multiple_ids[:-1]:
  684. response = self.imap_server.uid("STORE", id_to_remove,
  685. "+FLAGS.SILENT",
  686. "(\\Deleted \\Seen)")
  687. command = "silently delete and make seen %s" % (id_to_remove,)
  688. self.imap_server.check_response(command, response)
  689. if multiple_ids:
  690. new_id = multiple_ids[-1]
  691. else:
  692. # Let's hope it doesn't, but, just in case, if the search
  693. # turns up empty, we make the assumption that the new message
  694. # is the last one with a recent flag.
  695. response = self.imap_server.uid("SEARCH", "RECENT")
  696. data = self.imap_server.check_response("search recent",
  697. response)
  698. new_id = data[0]
  699. if new_id.find(' ') > -1:
  700. ids = new_id.split(' ')
  701. new_id = ids[-1]
  702. # Ok, now we're in trouble if we still haven't found it.
  703. # We make a huge assumption that the new message is the one
  704. # with the highest UID (they are sequential, so this will be
  705. # ok as long as another message hasn't also arrived).
  706. if new_id == "":
  707. response = self.imap_server.uid("SEARCH", "ALL")
  708. data = self.imap_server.check_response("search all",
  709. response)
  710. new_id = data[0]
  711. if new_id.find(' ') > -1:
  712. ids = new_id.split(' ')
  713. new_id = ids[-1]
  714. self.uid = new_id
  715. class IMAPFolder(object):
  716. def __init__(self, folder_name, imap_server, stats):
  717. self.name = folder_name
  718. self.imap_server = imap_server
  719. self.stats = stats
  720. # Unique names for cached messages - see _generate_id below.
  721. self.lastBaseMessageName = ''
  722. self.uniquifier = 2
  723. def __cmp__(self, obj):
  724. """Two folders are equal if their names are equal."""
  725. if obj is None:
  726. return False
  727. return cmp(self.name, obj.name)
  728. def __iter__(self):
  729. """Iterate through the messages in this IMAP folder."""
  730. for key in self.keys():
  731. yield self[key]
  732. def keys(self):
  733. '''Returns *uids* for all the messages in the folder not
  734. marked as deleted.'''
  735. self.imap_server.SelectFolder(self.name)
  736. response = self.imap_server.uid("SEARCH", "UNDELETED")
  737. data = self.imap_server.check_response("search undeleted", response)
  738. if data[0]:
  739. return data[0].split(' ')
  740. else:
  741. return []
  742. custom_header_id_re = re.compile(re.escape(\
  743. options["Headers", "mailid_header_name"]) + "\:\s*(\d+(?:\-\d)?)",
  744. re.IGNORECASE)
  745. message_id_re = re.compile("Message-ID\: ?\<([^\n\>]+)\>",
  746. re.IGNORECASE)
  747. def __getitem__(self, key):
  748. """Return message matching the given *uid*.
  749. The messages returned have no substance (so this should be
  750. reasonably quick, even with large messages). You need to call
  751. get_full_message() on the returned message to get the substance of
  752. the message from the server."""
  753. self.imap_server.SelectFolder(self.name)
  754. # Using RFC822.HEADER.LINES would be better here, but it seems
  755. # that not all servers accept it, even though it is in the RFC
  756. response = self.imap_server.uid("FETCH", key, "RFC822.HEADER")
  757. response_data = self.imap_server.check_response(\
  758. "fetch %s rfc822.header" % (key,), response)
  759. data = self.imap_server.extract_fetch_data(response_data)
  760. # The data will be a dictionary - hopefully with only one element,
  761. # but maybe more than one. The key is the message number, which we
  762. # do not have (we use the UID instead). So we look through the
  763. # message and use the first data of the right type we find.
  764. headers = None
  765. for msg_data in data.itervalues():
  766. if "RFC822.HEADER" in msg_data:
  767. headers = msg_data["RFC822.HEADER"]
  768. break
  769. if headers is None:
  770. raise BadIMAPResponseError("FETCH response", response_data)
  771. # Create a new IMAPMessage object, which will be the return value.
  772. msg = IMAPMessage()
  773. msg.folder = self
  774. msg.uid = key
  775. msg.imap_server = self.imap_server
  776. # We use the MessageID header as the ID for the message, as long
  777. # as it is available, and if not, we add our own.
  778. # Search for our custom id first, for backwards compatibility.
  779. for id_header_re in [self.custom_header_id_re, self.message_id_re]:
  780. mo = id_header_re.search(headers)
  781. if mo:
  782. msg.setId(mo.group(1))
  783. break
  784. else:
  785. newid = self._generate_id()
  786. if options["globals", "verbose"]:
  787. print >> sys.stderr, "[imapfilter] saving", msg.uid, "with new id:", newid
  788. msg.setId(newid)
  789. # Unfortunately, we now have to re-save this message, so that
  790. # our id is stored on the IMAP server. The vast majority of
  791. # messages have Message-ID headers, from what I can tell, so
  792. # we should only rarely have to do this. It's less often than
  793. # with the previous solution, anyway!
  794. # msg = msg.get_full_message()
  795. # msg.Save()
  796. if options["globals", "verbose"]:
  797. sys.stdout.write(".")
  798. return msg
  799. # Lifted straight from sb_server.py (under the name getNewMessageName)
  800. def _generate_id(self):
  801. # The message id is the time it arrived, with a uniquifier
  802. # appended if two arrive within one clock tick of each other.
  803. messageName = "%10.10d" % long(time.time())
  804. if messageName == self.lastBaseMessageName:
  805. messageName = "%s-%d" % (messageName, self.uniquifier)
  806. self.uniquifier += 1
  807. else:
  808. self.lastBaseMessageName = messageName
  809. self.uniquifier = 2
  810. return messageName
  811. def Train(self, classifier, isSpam):
  812. """Train folder as spam/ham."""
  813. num_trained = 0
  814. for msg in self:
  815. if msg.GetTrained() == (not isSpam):
  816. msg = msg.get_full_message()
  817. if msg.could_not_retrieve:
  818. # Something went wrong, and we couldn't even get
  819. # an invalid message, so just skip this one.
  820. # Annoyingly, we'll try to do it every time the
  821. # script runs, but hopefully the user will notice
  822. # the errors and move it soon enough.
  823. continue
  824. msg.delSBHeaders()
  825. classifier.unlearn(msg.tokenize(), not isSpam)
  826. if isSpam:
  827. old_class = options["Headers", "header_ham_string"]
  828. else:
  829. old_class = options["Headers", "header_spam_string"]
  830. # Once the message has been untrained, it's training memory
  831. # should reflect that on the off chance that for some
  832. # reason the training breaks.
  833. msg.RememberTrained(None)
  834. else:
  835. old_class = None
  836. if msg.GetTrained() is None:
  837. msg = msg.get_full_message()
  838. if msg.could_not_retrieve:
  839. continue
  840. saved_headers = msg.currentSBHeaders()
  841. msg.delSBHeaders()
  842. classifier.learn(msg.tokenize(), isSpam)
  843. num_trained += 1
  844. msg.RememberTrained(isSpam)
  845. self.stats.RecordTraining(not isSpam, old_class=old_class)
  846. if isSpam:
  847. move_opt_name = "move_trained_spam_to_folder"
  848. else:
  849. move_opt_name = "move_trained_ham_to_folder"
  850. if options["imap", move_opt_name] != "":
  851. # We need to restore the SpamBayes headers.
  852. for header, value in saved_headers.items():
  853. msg[header] = value
  854. msg.MoveTo(IMAPFolder(options["imap", move_opt_name],
  855. self.imap_server, self.stats))
  856. msg.Save()
  857. return num_trained
  858. def Filter(self, classifier, spamfolder, unsurefolder, hamfolder):
  859. count = {}
  860. count["ham"] = 0
  861. count["spam"] = 0
  862. count["unsure"] = 0
  863. for msg in self:
  864. cls = msg.GetClassification()
  865. if cls is None or hamfolder is not None:
  866. if options["globals", "verbose"]:
  867. print >> sys.stderr, "[imapfilter] classified as %s:" % cls, msg.uid
  868. msg = msg.get_full_message()
  869. if msg.could_not_retrieve:
  870. # Something went wrong, and we couldn't even get
  871. # an invalid message, so just skip this one.
  872. # Annoyingly, we'll try to do it every time the
  873. # script runs, but hopefully the user will notice
  874. # the errors and move it soon enough.
  875. if options["globals", "verbose"]:
  876. print >> sys.stderr, "[imapfilter] could not retrieve:", msg.uid
  877. continue
  878. (prob, clues) = classifier.spamprob(msg.tokenize(),
  879. evidence=True)
  880. # Add headers and remember classification.
  881. msg.delSBHeaders()
  882. msg.addSBHeaders(prob, clues)
  883. self.stats.RecordClassification(prob)
  884. cls = msg.GetClassification()
  885. if cls == options["Headers", "header_ham_string"]:
  886. if hamfolder:
  887. if options["globals", "verbose"]:
  888. print >> sys.stderr, "[imapfilter] moving to ham folder:",
  889. print >> sys.stderr, msg.uid
  890. msg.MoveTo(hamfolder)
  891. # Otherwise, we leave ham alone.
  892. count["ham"] += 1
  893. elif cls == options["Headers", "header_spam_string"]:
  894. if options["globals", "verbose"]:
  895. print >> sys.stderr, "[imapfilter] moving to spam folder:",
  896. print >> sys.stderr, msg.uid
  897. msg.MoveTo(spamfolder)
  898. count["spam"] += 1
  899. else:
  900. if options["globals", "verbose"]:
  901. print >> sys.stderr, "[imapfilter] moving to unsure folder:", msg.uid
  902. msg.MoveTo(unsurefolder)
  903. count["unsure"] += 1
  904. msg.Save()
  905. else:
  906. if options["globals", "verbose"]:
  907. print >> sys.stderr, "[imapfilter] already classified:", msg.uid
  908. return count
  909. class IMAPFilter(object):
  910. def __init__(self, classifier, stats):
  911. self.spam_folder = None
  912. self.unsure_folder = None
  913. self.ham_folder = None
  914. self.classifier = classifier
  915. self.imap_server = None
  916. self.stats = stats
  917. def Train(self):
  918. assert self.imap_server, "Cannot do anything without IMAP server."
  919. if options["globals", "verbose"]:
  920. t = time.time()
  921. total_trained = 0
  922. for is_spam, option_name in [(False, "ham_train_folders"),
  923. (True, "spam_train_folders")]:
  924. training_folders = options["imap", option_name]
  925. for fol in training_folders:
  926. # Select the folder to make sure it exists
  927. try:
  928. self.imap_server.SelectFolder(fol)
  929. except BadIMAPResponseError:
  930. print >> sys.stderr, "Skipping", fol, "as it cannot be selected."
  931. continue
  932. if options['globals', 'verbose']:
  933. print >> sys.stderr, (" Training %s folder %s" %
  934. (["ham", "spam"][is_spam], fol))
  935. folder = IMAPFolder(fol, self.imap_server, self.stats)
  936. num_trained = folder.Train(self.classifier, is_spam)
  937. total_trained += num_trained
  938. if options['globals', 'verbose']:
  939. print >> sys.stderr, "\n ", num_trained, "trained."
  940. if total_trained:
  941. self.classifier.store()
  942. if options["globals", "verbose"]:
  943. print >> sys.stderr, ("Training took %.4f seconds, %s messages were trained."
  944. % (time.time() - t, total_trained))
  945. def Filter(self):
  946. assert self.imap_server, "Cannot do anything without IMAP server."
  947. if not self.spam_folder:
  948. spam_folder_name = options["imap", "spam_folder"]
  949. if options["globals", "verbose"]:
  950. print >> sys.stderr, "[imapfilter] spam folder:", spam_folder_name
  951. self.spam_folder = IMAPFolder(
  952. spam_folder_name, self.imap_server, self.stats)
  953. if not self.unsure_folder:
  954. unsure_folder_name = options["imap", "unsure_folder"]
  955. if options["globals", "verbose"]:
  956. print >> sys.stderr, "[imapfilter] unsure folder:", unsure_folder_name
  957. self.unsure_folder = IMAPFolder(
  958. unsure_folder_name, self.imap_server, self.stats)
  959. ham_folder_name = options["imap", "ham_folder"]
  960. if options["globals", "verbose"]:
  961. print >> sys.stderr, "[imapfilter] ham folder:", ham_folder_name
  962. if ham_folder_name and not self.ham_folder:
  963. self.ham_folder = IMAPFolder(ham_folder_name, self.imap_server,
  964. self.stats)
  965. if options["globals", "verbose"]:
  966. t = time.time()
  967. count = {}
  968. count["ham"] = 0
  969. count["spam"] = 0
  970. count["unsure"] = 0
  971. # Select the ham, spam and unsure folders to make sure they exist.
  972. try:
  973. self.imap_server.SelectFolder(self.spam_folder.name)
  974. except BadIMAPResponseError:
  975. print >> sys.stderr, "Cannot select spam folder. Please check configuration."
  976. sys.exit(-1)
  977. try:
  978. self.imap_server.SelectFolder(self.unsure_folder.name)
  979. except BadIMAPResponseError:
  980. print >> sys.stderr, "Cannot select unsure folder. Please check configuration."
  981. sys.exit(-1)
  982. if self.ham_folder:
  983. try:
  984. self.imap_server.SelectFolder(self.ham_folder.name)
  985. except BadIMAPResponseError:
  986. print >> sys.stderr, "Cannot select ham folder. Please check configuration."
  987. sys.exit(-1)
  988. for filter_folder in options["imap", "filter_folders"]:
  989. # Select the folder to make sure it exists.
  990. try:
  991. self.imap_server.SelectFolder(filter_folder)
  992. except BadIMAPResponseError:
  993. print >> sys.stderr, "Cannot select", filter_folder, "... skipping."
  994. continue
  995. folder = IMAPFolder(filter_folder, self.imap_server, self.stats)
  996. subcount = folder.Filter(self.classifier, self.spam_folder,
  997. self.unsure_folder, self.ham_folder)
  998. for key in count.keys():
  999. count[key] += subcount.get(key, 0)
  1000. if options["globals", "verbose"]:
  1001. if count is not None:
  1002. print >> sys.stderr, ("\nClassified %s ham, %s spam, and %s unsure." %
  1003. (count["ham"], count["spam"], count["unsure"]))
  1004. print >> sys.stderr, "Classifying took %.4f seconds." % (time.time() - t,)
  1005. def servers(promptForPass = False):
  1006. """Returns a list containing a tuple (server,user,passwd) for each IMAP server in options.
  1007. If promptForPass is True or at least on password is missing from options,
  1008. prompts the user for each server's password.
  1009. """
  1010. servers = options["imap", "server"]
  1011. usernames = options["imap", "username"]
  1012. pwds = options["imap", "password"]
  1013. if promptForPass or len(pwds) < len(usernames):
  1014. pwds = []
  1015. for u in usernames:
  1016. pwds.append(getpass("Enter password for %s:" % (u,)))
  1017. return zip(servers, usernames, pwds)
  1018. def run(force_UI=False):
  1019. try:
  1020. opts, args = getopt.getopt(sys.argv[1:], 'hbPtcvl:e:i:d:p:o:',

Large files files are truncated, but you can click here to view the full file