/unladen_swallow/lib/spambayes/scripts/sb_imapfilter.py
Python | 1324 lines | 1257 code | 11 blank | 56 comment | 15 complexity | c0543bea710b8f588d097fea5cde8098 MD5 | raw file
Possible License(s): BSD-3-Clause, Apache-2.0, GPL-2.0
- #!/usr/bin/env python
- """An IMAP filter. An IMAP message box is scanned and all non-scored
- messages are scored and (where necessary) filtered.
- Usage:
- sb_imapfilter [options]
- note: option values with spaces in them must be enclosed
- in double quotes
- options:
- -p dbname : pickled training database filename
- -d dbname : dbm training database filename
- -t : train contents of spam folder and ham folder
- -c : classify inbox
- -h : display this message
- -v : verbose mode
- -P : security option to prompt for imap password,
- rather than look in options["imap", "password"]
- -e y/n : expunge/purge messages on exit (y) or not (n)
- -i debuglvl : a somewhat mysterious imaplib debugging level
- (4 is a good level, and suitable for bug reports)
- -l minutes : period of time between filtering operations
- -b : Launch a web browser showing the user interface.
- -o section:option:value :
- set [section, option] in the options database
- to value
- Examples:
- Classify inbox, with dbm database
- sb_imapfilter -c -d bayes.db
- Train Spam and Ham, then classify inbox, with dbm database
- sb_imapfilter -t -c -d bayes.db
- Train Spam and Ham only, with pickled database
- sb_imapfilter -t -p bayes.db
- Warnings:
- o We never delete mail, unless you use the -e/purge option, but we do
- mark a lot as deleted, and your mail client might remove that for
- you. We try to only mark as deleted once the moved/altered message
- is correctly saved, but things might go wrong. We *strongly*
- recommend that you try this script out on mail that you can recover
- from somewhere else, at least at first.
- """
- from __future__ import generators
- todo = """
- o IMAP supports authentication via other methods than the plain-text
- password method that we are using at the moment. Neither of the
- servers I have access to offer any alternative method, however. If
- someone's does, then it would be nice to offer this.
- Thanks to #1169939 we now support CRAM_MD5 if available. It'd still
- be good to support others, though.
- o Usernames should be able to be literals as well as quoted strings.
- This might help if the username/password has special characters like
- accented characters.
- o Suggestions?
- """
- # This module is part of the SpamBayes project, which is Copyright 2002-2007
- # The Python Software Foundation and is covered by the Python Software
- # Foundation license.
- __author__ = "Tony Meyer <ta-meyer@ihug.co.nz>, Tim Stone"
- __credits__ = "All the SpamBayes folk. The original filter design owed " \
- "much to isbg by Roger Binns (http://www.rogerbinns.com/isbg)."
- # If we are running as a frozen application, then chances are that
- # output is just lost. We'd rather log this, like sb_server and Outlook
- # log, so that the user can pull up the output if possible. We could just
- # rely on the user piping the output appropriately, but would rather have
- # more control. The sb_server tray application only does this if not
- # running in a console window, but we do it whenever we are frozen.
- import os
- import sys
if hasattr(sys, "frozen"):
    # A frozen application usually has nowhere for its output to go, so
    # stdout and stderr are redirected to a rotating set of log files.
    # We want to move to logging module later, so for now, we
    # hack together a simple logging strategy.
    try:
        import win32api
    except ImportError:
        if sys.platform == "win32":
            # Fall back to CWD, but warn user.
            status = "Warning: your log is stored in the current " \
                     "working directory. We recommend installing " \
                     "the pywin32 extensions, so that the log is " \
                     "stored in the Windows temp directory."
            temp_dir = os.getcwd()
        else:
            # Try for a /tmp directory.
            if os.path.isdir("/tmp"):
                temp_dir = "/tmp"
                status = "Log file opened in /tmp"
            else:
                status = "Warning: your log is stored in the current " \
                         "working directory. If this does not suit you " \
                         "please let the spambayes@python.org crowd know " \
                         "so that an alternative can be arranged."
                # The message above promises the current working
                # directory, so actually use it; without this assignment
                # temp_dir would be undefined below.
                temp_dir = os.getcwd()
    else:
        temp_dir = win32api.GetTempPath()
        status = "Log file opened in " + temp_dir
    # Rotate the existing logs, oldest first: log3 -> log4, log2 -> log3,
    # log1 -> log2.  Missing files are expected, so errors are ignored.
    for i in range(3, 0, -1):
        try:
            os.unlink(os.path.join(temp_dir, "SpamBayesIMAP%d.log" % (i+1)))
        except os.error:
            pass
        try:
            os.rename(
                os.path.join(temp_dir, "SpamBayesIMAP%d.log" % i),
                os.path.join(temp_dir, "SpamBayesIMAP%d.log" % (i+1))
                )
        except os.error:
            pass
    # Open this log, as unbuffered, so crashes still get written.
    sys.stdout = open(os.path.join(temp_dir,"SpamBayesIMAP1.log"), "wt", 0)
    sys.stderr = sys.stdout
- import socket
- import re
- import time
- import getopt
- import types
- import thread
- import email
- import email.Parser
- from getpass import getpass
- from email.Utils import parsedate
- from spambayes import Stats
- from spambayes import message
- from spambayes.Options import options, optionsPathname
- from spambayes import storage, Dibbler
- from spambayes.UserInterface import UserInterfaceServer
- from spambayes.ImapUI import IMAPUserInterface, LoginFailure
- from spambayes.Version import get_current_version
- from imaplib import IMAP4
- from imaplib import Time2Internaldate
# Choose the imaplib class that IMAPSession extends: the SSL variant when
# the "use_ssl" option is set and this Python provides it, otherwise the
# plain IMAP4 class.
try:
    if not options["imap", "use_ssl"]:
        from imaplib import IMAP4 as BaseIMAP
    else:
        from imaplib import IMAP4_SSL as BaseIMAP
except ImportError:
    # No SSL support in this Python build; fall back to plain IMAP.
    from imaplib import IMAP4 as BaseIMAP
class BadIMAPResponseError(Exception):
    """Raised when an IMAP command returns anything other than "OK".

    The command that failed and the response received are stored on the
    instance as ``command`` and ``response``.
    """

    def __init__(self, command, response):
        self.response = response
        self.command = command

    def __str__(self):
        template = "The command '%s' failed to give an OK response.\n%s"
        return template % (self.command, self.response)
- class IMAPSession(BaseIMAP):
- '''A class extending the IMAP4 class, with a few optimizations'''
- timeout = 60 # seconds
def __init__(self, server, debug=0, do_expunge=options["imap", "expunge"]):
    """Open a connection to `server`, given as "host" or "host:port".

    Connection failures are not raised: they are reported (in verbose
    mode) and recorded in ``self.connected``.  `debug` is stored as the
    imaplib debug level and `do_expunge` controls whether deleted
    messages are purged when folders are left or on logout.
    """
    if ":" not in server:
        # No explicit port: use the standard one for the chosen protocol.
        if options["imap", "use_ssl"]:
            port = 993
        else:
            port = 143
    else:
        server, port = server.split(':', 1)
        port = int(port)

    # There's a tricky situation where if use_ssl is False, but we
    # try to connect to an IMAP over SSL server, we will just hang
    # forever, waiting for a response that will never come.  To
    # get past this, just for the welcome message, we install a
    # timeout on the connection.  Normal service is then returned.
    # This only applies when we are not using SSL.
    plain_connection = not hasattr(self, "ssl")
    if plain_connection:
        readline = self.readline
        self.readline = self.readline_timeout
    try:
        BaseIMAP.__init__(self, server, port)
    except (BaseIMAP.error, socket.gaierror, socket.error):
        if options["globals", "verbose"]:
            print >> sys.stderr, "Cannot connect to server", server, "on port", port
            if plain_connection:
                print >> sys.stderr, ("If you are connecting to an SSL server,"
                                      "please ensure that you\n"
                                      "have the 'Use SSL' option enabled.")
        self.connected = False
    else:
        self.connected = True
    if plain_connection:
        # Welcome message dealt with; restore the normal readline.
        self.readline = readline

    self.server = server
    self.port = port
    self.debug = debug
    self.do_expunge = do_expunge
    self.logged_in = False

    # For efficiency, we remember which folder we are currently
    # in, and only send a select command to the IMAP server if
    # we want to *change* folders.  This functionality is used by
    # both IMAPMessage and IMAPFolder.
    self.current_folder = None

    # We override the base read so that we only read a certain amount
    # of data at a time.  OS X and Python has problems with getting
    # large amounts of memory at a time, so maybe this will be a way we
    # can work around that (I don't know, and don't have a mac to test,
    # but we need to try something).
    self._read = self.read
    self.read = self.safe_read
def readline_timeout(self):
    """Read line from remote, possibly timing out.

    The socket is polled in non-blocking mode, one character at a time,
    for at most ``self.timeout`` seconds.  Returns the characters
    received before the terminating newline (the newline itself is not
    included); the result is shorter, possibly empty, if the timeout
    expires or the remote end closes the connection first.

    Raises socket.error for any socket failure other than "no data
    available yet".  The socket is returned to blocking mode in every
    case.
    """
    import errno
    # "Nothing to read yet" on a non-blocking socket is reported as
    # EAGAIN/EWOULDBLOCK on POSIX systems and as WSAEWOULDBLOCK (10035)
    # on Windows.
    would_block = (errno.EAGAIN, errno.EWOULDBLOCK, 10035)
    st_time = time.time()
    received = []
    self.sock.setblocking(False)
    try:
        while True:
            if (time.time() - st_time) > self.timeout:
                if options["globals", "verbose"]:
                    print >> sys.stderr, "IMAP Timing out"
                break
            try:
                data = self.sock.recv(1)
            except socket.error:
                if sys.exc_info()[1].args[0] in would_block:
                    # Nothing to receive, keep going.
                    continue
                raise
            if not data or data == '\n':
                # Connection closed, or end of line reached.
                break
            received.append(data)
    finally:
        # Normal (blocking) service must resume even if recv() failed.
        self.sock.setblocking(True)
    return "".join(received)
def login(self, username, pwd):
    """Authenticate with the server, turning failures into LoginFailure.

    CRAM-MD5 is used when the server advertises it, otherwise the
    superclass plain-text login.  Sets ``self.logged_in`` on success.
    """
    assert self.connected, "Must be connected before logging in."
    use_cram_md5 = 'AUTH=CRAM-MD5' in self.capabilities
    if use_cram_md5:
        description = "MD5"
    else:
        description = "plain-text"
    try:
        if use_cram_md5:
            self.login_cram_md5(username, pwd)
        else:
            BaseIMAP.login(self, username, pwd)  # superclass login
    except BaseIMAP.error:
        raise LoginFailure("The username (%s) and/or password (sent in %s) may "
                           "be incorrect." % (username, description))
    self.logged_in = True
def logout(self):
    """Log off from the IMAP server, expunging first if so configured.

    Most, if not all, expunging is probably done in SelectFolder for
    speed; this is the final sweep over the configured folders."""
    may_expunge = self.connected and self.logged_in and self.do_expunge
    # We may never have logged in, in which case there is nothing to purge.
    if may_expunge:
        # The ham, spam and unsure folders hold a single name each.
        for option_name in ("spam_folder", "unsure_folder", "ham_folder"):
            folder_name = options["imap", option_name]
            if not folder_name:
                continue
            self.select(folder_name)
            self.expunge()
        # The training options hold a list of folder names each.
        for option_name in ("ham_train_folders", "spam_train_folders"):
            for folder_name in options["imap", option_name]:
                self.select(folder_name)
                self.expunge()
    BaseIMAP.logout(self)  # superclass logout
def check_response(self, command, IMAP_response):
    """Return the data part of an IMAP response, insisting on "OK".

    `IMAP_response` is a (status, data) pair.  `command` is only used to
    describe the failure: BadIMAPResponseError is raised when the status
    is anything other than "OK"."""
    status, payload = IMAP_response
    if status == "OK":
        return payload
    raise BadIMAPResponseError(command, IMAP_response)
def SelectFolder(self, folder):
    """Make `folder` the target of the IMAP operations that follow.

    Wraps the IMAP select command, skipping it (and returning None) when
    `folder` is already selected.  Otherwise returns the data of the
    select response; raises BadIMAPResponseError if the select fails or
    `folder` is the empty string."""
    if self.current_folder == folder:
        return None
    if self.current_folder is not None and self.do_expunge:
        # It is faster to do close() than a single
        # expunge when we log out (because expunge returns
        # a list of all the deleted messages which we don't do
        # anything with).
        self.close()
        self.current_folder = None
    if folder == "":
        # This is Python bug #845560 - if the empty string is
        # passed, we get a traceback, not just an 'invalid folder'
        # error, so raise our own error.
        raise BadIMAPResponseError("select",
                                   "Cannot have empty string as "
                                   "folder name in select")
    # We *always* use SELECT and not EXAMINE, because this
    # speeds things up considerably.
    select_response = self.select(folder, None)
    select_data = self.check_response("select %s" % (folder,),
                                      select_response)
    self.current_folder = folder
    return select_data
# A literal size marker, e.g. "{12}", and the parenthesised attribute
# list that starts each line of a LIST response.
number_re = re.compile(r"{\d+}")
folder_re = re.compile(r"\(([\w\\ ]*)\) ")

def folder_list(self):
    """Return the names of all folders on the server, sorted
    alphabetically.

    An empty list is returned if the LIST command fails.  As a side
    effect ``self.folder_delimiter`` is set to the delimiter of the last
    folder parsed."""
    response = self.list()
    try:
        listing = self.check_response("list", response)
    except BadIMAPResponseError:
        # We want to keep going, so just print out a warning, and
        # return an empty list.
        if options["globals", "verbose"]:
            print >> sys.stderr, "Could not retrieve folder list."
        return []
    names = []
    for entry in listing:
        # Sigh.  Some servers may give us back the folder name as a
        # literal, so we need to crunch this out.
        if isinstance(entry, tuple):
            literal = self.number_re.search(entry[0])
            if not literal:
                # Something is wrong here!  Skip this folder.
                continue
            entry = '%s"%s"' % (entry[0][:literal.start()], entry[1])
        attributes = self.folder_re.search(entry)
        if not attributes:
            # Something is not good with this folder, so skip it.
            continue
        after_attributes = attributes.end()
        # IMAP is a truly odd protocol.  The delimiter is
        # only the delimiter for this particular folder - each
        # folder *may* have a different delimiter
        self.folder_delimiter = entry[after_attributes+1:after_attributes+2]
        # A bit of a hack, but we really need to know if this is
        # the case.
        if self.folder_delimiter == ',':
            print >> sys.stderr, ("WARNING: Your imap server uses a comma as the "
                                  "folder delimiter. This may cause unpredictable " \
                                  "errors.")
        names.append(entry[after_attributes+4:].strip('"'))
    names.sort()
    return names
# A flag can have any character in the ascii range 32-126 except for
# (){ %*"\
FLAG_CHARS = "".join([chr(i) for i in range(32, 127)
                      if chr(i) not in ('(', ')', '{', ' ', '%', '*',
                                        '"', '\\')])
FLAG = r"\\?[" + re.escape(FLAG_CHARS) + r"]+"
# The empty flag set "()" doesn't match, so that extract_fetch_data()
# returns data["FLAGS"] == None
FLAGS_RE = re.compile(r"(FLAGS) (\((" + FLAG + r" )*(" + FLAG + r")\))")
INTERNALDATE_RE = re.compile(r"(INTERNALDATE) (\"\d{1,2}\-[A-Za-z]{3,3}\-" +
                             r"\d{2,4} \d{2,2}\:\d{2,2}\:\d{2,2} " +
                             r"[\+\-]\d{4,4}\")")
# Items whose value arrives as a literal: group 2 is the "{size}" marker.
RFC822_RE = re.compile(r"(RFC822) (\{[\d]+\})")
BODY_PEEK_RE = re.compile(r"(BODY\[\]) (\{[\d]+\})")
RFC822_HEADER_RE = re.compile(r"(RFC822.HEADER) (\{[\d]+\})")
UID_RE = re.compile(r"(UID) ([\d]+)")
# A trailing " UID n)" fragment on its own.
UID_RE2 = re.compile(r" *(UID) ([\d]+)\)")
# Message number followed by the parenthesised list of fetch items.
FETCH_RESPONSE_RE = re.compile(r"([0-9]+) \(([" +
                               re.escape(FLAG_CHARS) +
                               r"\"\{\}\(\)\\ ]*)\)?")
LITERAL_RE = re.compile(r"^\{[\d]+\}$")
def _extract_fetch_data(self, response):
    """Parse the FETCH response parts that describe one message.

    Returns a dictionary with a "message_number" entry plus one entry
    per recognised fetch item.  Raises BadIMAPResponseError when a part
    that should start with a message number does not.
    """
    # We support the following FETCH items:
    #  FLAGS
    #  INTERNALDATE
    #  RFC822
    #  UID
    #  RFC822.HEADER
    #  BODY.PEEK
    # All others are ignored.
    if isinstance(response, types.StringTypes):
        response = (response,)
    # A trailing " UID n)" fragment is dropped before parsing.
    if self.UID_RE2.match(response[-1]):
        response = response[:-1]

    item_patterns = (self.FLAGS_RE, self.INTERNALDATE_RE, self.RFC822_RE,
                     self.UID_RE, self.RFC822_HEADER_RE, self.BODY_PEEK_RE)
    fetched = {}
    # Name of the item whose literal value is expected in the next part.
    pending_item = None
    for part in response:
        # We ignore parentheses by themselves, for convenience.
        if part == ')':
            continue
        if pending_item:
            # This part is the literal announced by the previous one.
            # Its length is deliberately not checked against the
            # announced size.
            fetched[pending_item] = part
            pending_item = None
            continue
        # The first item will always be the message number.
        header = self.FETCH_RESPONSE_RE.match(part)
        if not header:
            raise BadIMAPResponseError("FETCH response", response)
        fetched["message_number"] = header.group(1)
        items = header.group(2)

        for pattern in item_patterns:
            found = pattern.search(items)
            if found is None:
                continue
            if self.LITERAL_RE.match(found.group(2)):
                # The next element will be a literal.
                pending_item = found.group(1)
            else:
                fetched[found.group(1)] = found.group(2)
    return fetched
def extract_fetch_data(self, response):
    """Turn the data of an IMAP FETCH response into a dictionary.

    The result maps each message number to the dictionary of fetch items
    that _extract_fetch_data() produced for it.
    """
    if isinstance(response, types.StringTypes):
        response = (response,)
    by_number = {}
    # There may be more than one message number in the response, so
    # each element is handled separately.
    for element in response:
        items = self._extract_fetch_data(element)
        if not items:
            continue
        number = items["message_number"]
        if number not in by_number:
            by_number[number] = items
        else:
            # Maybe there are two about the same message number!
            by_number[number].update(items)
    return by_number
# Maximum amount of data that will be read at any one time.
MAXIMUM_SAFE_READ = 4096

def safe_read(self, size):
    """Read `size` bytes from remote, but in manageable sizes.

    The underlying read is never asked for more than MAXIMUM_SAFE_READ
    bytes at once.  Progress is measured by what was actually received,
    so a read that returns less than requested is simply followed by
    another one.  Returns the data read, which is shorter than `size`
    only if the underlying read reports end of data.
    """
    chunks = []
    remaining = size
    while remaining > 0:
        chunk = self._read(min(remaining, self.MAXIMUM_SAFE_READ))
        if not chunk:
            # End of data: stop rather than spin on empty reads.
            break
        chunks.append(chunk)
        remaining -= len(chunk)
    return "".join(chunks)
- class IMAPMessage(message.SBHeaderMessage):
def __init__(self):
    """Create a message that has no folder, server or content yet."""
    message.SBHeaderMessage.__init__(self)
    # Where the message lives, and where it came from if it was moved.
    self.imap_server = None
    self.folder = None
    self.previous_folder = None
    # FETCH item used to request the full text, and the key under which
    # the text appears in the parsed response.
    self.rfc822_command = "(BODY.PEEK[])"
    self.rfc822_key = "BODY[]"
    # State flags maintained by get_full_message().
    self.got_substance = False
    self.invalid = False
    self.could_not_retrieve = False
def extractTime(self):
    """Return an IMAP internal-date string to use for a copy of this
    message.

    The message's own Date header is used when it is present and can be
    converted; otherwise the current time is used."""
    date_header = self["Date"]
    if date_header is not None:
        date_tuple = parsedate(date_header)
        if date_tuple is not None:
            try:
                return Time2Internaldate(time.mktime(date_tuple))
            except (ValueError, OverflowError):
                # Invalid dates can make mktime() fail, for example a
                # year of 0102 gives "ValueError: year out of range".
                # (Why this person is getting mail from almost two
                # thousand years ago is another question <wink>).
                # In any case, we just pass and use the current date.
                pass
    return Time2Internaldate(time.time())
def get_full_message(self):
    """Retrieve the RFC822 message from the IMAP server and return a
    new IMAPMessage object that has the same details as this message,
    but also has the substance.

    Returns self instead of a new object when the substance was already
    fetched, when the message could not be retrieved (folder cannot be
    selected, or MemoryError; ``could_not_retrieve`` is set), or when the
    text could not be parsed (``invalid`` is set and the raw text, with
    an exception header, is kept in ``invalid_content``).

    Raises BadIMAPResponseError if the FETCH fails or its response does
    not contain the message text."""
    if self.got_substance:
        return self
    assert self.id, "Cannot get substance of message without an id"
    assert self.uid, "Cannot get substance of message without an UID"
    assert self.imap_server, "Cannot do anything without IMAP connection"
    # First, try to select the folder that the message is in.
    try:
        self.imap_server.SelectFolder(self.folder.name)
    except BadIMAPResponseError:
        # Can't select the folder, so getting the substance will not
        # work.
        self.could_not_retrieve = True
        print >> sys.stderr, "Could not select folder %s for message " \
              "%s (uid %s)" % (self.folder.name, self.id, self.uid)
        return self
    # Now try to fetch the substance of the message.
    try:
        response = self.imap_server.uid("FETCH", self.uid,
                                        self.rfc822_command)
    except MemoryError:
        # Really big messages can trigger a MemoryError here (seen in
        # socket.py, and on Mac OS X 10.3 which is very stingy with
        # memory).  We want to handle this gracefully, although we
        # can't rewrite the message since we can't load it in the first
        # place.  Maybe an elegant solution would be to get the message
        # in parts, or just use the first X characters for
        # classification.  For now, we just carry on, warning the user
        # and ignoring the message.
        self.could_not_retrieve = True
        print >> sys.stderr, "MemoryError with message %s (uid %s)" % \
              (self.id, self.uid)
        return self
    command = "uid fetch %s" % (self.uid,)
    response_data = self.imap_server.check_response(command, response)
    data = self.imap_server.extract_fetch_data(response_data)
    # The data will be a dictionary - hopefully with only one element,
    # but maybe more than one.  The key is the message number, which we
    # do not have (we use the UID instead).  So we look through the
    # message and use the first data of the right type we find.
    rfc822_data = None
    for msg_data in data.itervalues():
        if self.rfc822_key in msg_data:
            rfc822_data = msg_data[self.rfc822_key]
            break
    if rfc822_data is None:
        raise BadIMAPResponseError("FETCH response", response_data)
    try:
        new_msg = email.message_from_string(rfc822_data, IMAPMessage)
    except (KeyboardInterrupt, SystemExit):
        # A request to stop is not a parsing problem; let it through.
        raise
    except:
        # We use a general 'except' because the email package doesn't
        # always return email.Errors (it can return a TypeError, for
        # example) if the email is invalid.  In any case, we want
        # to keep going, and not crash, because we might leave the
        # user's mailbox in a bad state if we do.  Better to soldier on.
        #
        # Returning without content would be ok for training, but for
        # filtering the original would be flagged as deleted and
        # replaced by a message holding *only the spambayes headers*.
        # So, like sb_server, we keep the original text together with
        # an exception header describing what went wrong.
        self.invalid = True
        text, details = message.insert_exception_header(
            rfc822_data, self.id)
        self.invalid_content = text
        self.got_substance = True
        # Print the exception and a traceback.
        print >> sys.stderr, details
        return self
    # Carry this message's identity over to the parsed copy.
    new_msg.folder = self.folder
    new_msg.previous_folder = self.previous_folder
    new_msg.rfc822_command = self.rfc822_command
    new_msg.rfc822_key = self.rfc822_key
    new_msg.imap_server = self.imap_server
    new_msg.uid = self.uid
    new_msg.setId(self.id)
    new_msg.got_substance = True
    if not new_msg.has_key(options["Headers", "mailid_header_name"]):
        new_msg[options["Headers", "mailid_header_name"]] = self.id
    if options["globals", "verbose"]:
        # Overwrite the progress "." with a "*".
        sys.stdout.write(chr(8) + "*")
    return new_msg
def MoveTo(self, dest):
    '''Record that this message should end up in folder `dest`.

    Nothing is sent to the server here; the move is carried out by
    Save(), for efficiency.  The folder the message started in is
    remembered only on the first move.'''
    first_move = self.previous_folder is None
    if first_move:
        self.previous_folder = self.folder
    self.folder = dest
def as_string(self, unixfrom=False):
    """Return the text of the message.

    Basically the same as the parent class's method, except that an
    unparsable message (one that was never filtered and is not really a
    proper email.Message object) is returned as its stored raw content.
    The from line is not mangled; the server must take care of this."""
    if not self.invalid:
        return message.SBHeaderMessage.as_string(self, unixfrom,
                                                 mangle_from_=False)
    return self._force_CRLF(self.invalid_content)
# Matches the \Recent flag together with one adjacent space.
recent_re = re.compile(r"\\Recent ?| ?\\Recent")

def Save(self):
    """Save message to IMAP server.

    We can't actually update the message with IMAP, so what we do is
    create a new message and delete the old one.  Afterwards ``self.uid``
    is updated to the UID of the new copy.

    Raises BadIMAPResponseError if the message cannot be appended or if
    one of the other IMAP commands fails."""
    assert self.folder is not None, \
           "Can't save a message that doesn't have a folder."
    assert self.id, "Can't save a message that doesn't have an id."
    assert self.imap_server, "Can't do anything without IMAP connection."
    response = self.imap_server.uid("FETCH", self.uid,
                                    "(FLAGS INTERNALDATE)")
    command = "fetch %s (flags internaldate)" % (self.uid,)
    response_data = self.imap_server.check_response(command, response)
    data = self.imap_server.extract_fetch_data(response_data)
    # The data will be a dictionary - hopefully with only one element,
    # but maybe more than one.  The key is the message number, which we
    # do not have (we use the UID instead).  So we look through the
    # message and use the last data of the right type we find.
    msg_time = self.extractTime()
    flags = None
    for msg_data in data.itervalues():
        if "INTERNALDATE" in msg_data:
            msg_time = msg_data["INTERNALDATE"]
        if "FLAGS" in msg_data:
            flags = msg_data["FLAGS"]
            # The \Recent flag can be fetched, but cannot be stored
            # We must remove it from the list if it is there.
            flags = self.recent_re.sub("", flags)

    # We try to save with flags and time, then with just the
    # time, then with the flags and the current time, then with just
    # the current time.  The first should work, but the first three
    # sometimes (due to the quirky IMAP server) fail.
    text = self.as_string()
    response = None
    for flgs, tme in [(flags, msg_time),
                      (None, msg_time),
                      (flags, Time2Internaldate(time.time())),
                      (None, Time2Internaldate(time.time()))]:
        try:
            response = self.imap_server.append(self.folder.name, flgs, tme,
                                               text)
        except BaseIMAP.error:
            continue
        try:
            self.imap_server.check_response("", response)
        except BadIMAPResponseError:
            pass
        else:
            break
    else:
        # Every attempt failed.  `response` is the last one received, or
        # None if each append raised before a response came back.
        command = "append %s %s %s %s" % (self.folder.name, flgs, tme,
                                          text)
        raise BadIMAPResponseError(command, response)

    # Mark the original as deleted, in the folder it was read from.
    if self.previous_folder is None:
        self.imap_server.SelectFolder(self.folder.name)
    else:
        self.imap_server.SelectFolder(self.previous_folder.name)
        self.previous_folder = None
    response = self.imap_server.uid("STORE", self.uid, "+FLAGS.SILENT",
                                    "(\\Deleted \\Seen)")
    command = "set %s to be deleted and seen" % (self.uid,)
    self.imap_server.check_response(command, response)

    # Not all IMAP servers immediately offer the new message, but
    # we need to find it to get the new UID.  We need to wait until
    # the server offers up an EXISTS command, so we no-op until that
    # is the case.
    # See [ 941596 ] sb_imapfilter.py not adding headers / moving messages
    # We use the recent() function, which no-ops if necessary.  We try
    # 100 times, and then give up.  If a message arrives independently,
    # and we are told about it before our message, then this could
    # cause trouble, but that would be one weird server.
    for i in xrange(100):
        response = self.imap_server.recent()
        data = self.imap_server.check_response("recent", response)
        if data[0] is not None:
            if options["globals", "verbose"]:
                print >> sys.stderr, "[imapfilter] found saved message", self.uid,
                print >> sys.stderr, "in iteration", i
            break
    else:
        if options["globals", "verbose"]:
            print >> sys.stderr, ("[imapfilter] can't find saved message after "
                                  "100 iterations:"), self.uid
        # raise BadIMAPResponseError("recent", "Cannot find saved message")

    # We need to update the UID, as it will have changed.
    # Although we don't use the UID to keep track of messages, we do
    # have to use it for IMAP operations.
    self.imap_server.SelectFolder(self.folder.name)
    search_string = "(UNDELETED HEADER %s \"%s\")" % \
                    (options["Headers", "mailid_header_name"],
                     self.id.replace('\\',r'\\').replace('"',r'\"'))
    response = self.imap_server.uid("SEARCH", search_string)
    data = self.imap_server.check_response("search " + search_string,
                                           response)
    # A search without any result data is treated like an empty result.
    new_id = data[0] or ""
    # See [ 870799 ] imap trying to fetch invalid message UID
    # It seems that although the save gave a "NO" response to the
    # first save, the message was still saved (without the flags,
    # probably).  This really isn't good behaviour on the server's
    # part, but, as usual, we try and deal with it.  So, if we get
    # more than one undeleted message with the same SpamBayes id,
    # delete all of them apart from the last one, and use that.
    multiple_ids = new_id.split()
    for id_to_remove in multiple_ids[:-1]:
        response = self.imap_server.uid("STORE", id_to_remove,
                                        "+FLAGS.SILENT",
                                        "(\\Deleted \\Seen)")
        command = "silently delete and make seen %s" % (id_to_remove,)
        self.imap_server.check_response(command, response)
    if multiple_ids:
        new_id = multiple_ids[-1]
    else:
        # Let's hope it doesn't, but, just in case, if the search
        # turns up empty, we make the assumption that the new message
        # is the last one with a recent flag.
        response = self.imap_server.uid("SEARCH", "RECENT")
        data = self.imap_server.check_response("search recent",
                                               response)
        new_id = data[0] or ""
        if new_id.find(' ') > -1:
            ids = new_id.split(' ')
            new_id = ids[-1]
        # Ok, now we're in trouble if we still haven't found it.
        # We make a huge assumption that the new message is the one
        # with the highest UID (they are sequential, so this will be
        # ok as long as another message hasn't also arrived).
        if new_id == "":
            response = self.imap_server.uid("SEARCH", "ALL")
            data = self.imap_server.check_response("search all",
                                                   response)
            new_id = data[0] or ""
            if new_id.find(' ') > -1:
                ids = new_id.split(' ')
                new_id = ids[-1]
    self.uid = new_id
- class IMAPFolder(object):
- def __init__(self, folder_name, imap_server, stats):
- self.name = folder_name
- self.imap_server = imap_server
- self.stats = stats
- # Unique names for cached messages - see _generate_id below.
- self.lastBaseMessageName = ''
- self.uniquifier = 2
- def __cmp__(self, obj):
- """Two folders are equal if their names are equal."""
- if obj is None:
- return False
- return cmp(self.name, obj.name)
- def __iter__(self):
- """Iterate through the messages in this IMAP folder."""
- for key in self.keys():
- yield self[key]
def keys(self):
    '''Return the *uids* of all messages in the folder that are not
    marked as deleted, or an empty list when there are none.'''
    server = self.imap_server
    server.SelectFolder(self.name)
    found = server.check_response("search undeleted",
                                  server.uid("SEARCH", "UNDELETED"))
    if not found[0]:
        return []
    return found[0].split(' ')
# The SpamBayes id header: a run of digits, optionally followed by "-" and
# a uniquifier (see _generate_id).  The uniquifier may have more than one
# digit, so all of its digits must be captured to keep ids distinct.
custom_header_id_re = re.compile(
    re.escape(options["Headers", "mailid_header_name"]) +
    r"\:\s*(\d+(?:\-\d+)?)",
    re.IGNORECASE)
# The standard Message-ID header; group 1 is the text between < and >.
message_id_re = re.compile(r"Message-ID\: ?\<([^\n\>]+)\>",
                           re.IGNORECASE)
def __getitem__(self, key):
    """Return the message with the given *uid*.

    Only the headers are fetched, so the message returned has no
    substance (this should be reasonably quick, even with large
    messages).  Call get_full_message() on the returned message to get
    the substance of the message from the server.

    Raises BadIMAPResponseError if the FETCH fails or returns no
    headers."""
    server = self.imap_server
    server.SelectFolder(self.name)
    # Using RFC822.HEADER.LINES would be better here, but it seems
    # that not all servers accept it, even though it is in the RFC
    response = server.uid("FETCH", key, "RFC822.HEADER")
    response_data = server.check_response(
        "fetch %s rfc822.header" % (key,), response)
    data = server.extract_fetch_data(response_data)
    # The data will be a dictionary - hopefully with only one element,
    # but maybe more than one.  The key is the message number, which we
    # do not have (we use the UID instead).  So we look through the
    # message and use the first data of the right type we find.
    headers = None
    for msg_data in data.itervalues():
        if "RFC822.HEADER" in msg_data:
            headers = msg_data["RFC822.HEADER"]
            break
    if headers is None:
        raise BadIMAPResponseError("FETCH response", response_data)

    # Create a new IMAPMessage object, which will be the return value.
    msg = IMAPMessage()
    msg.folder = self
    msg.uid = key
    msg.imap_server = server
    # We use the MessageID header as the ID for the message, as long
    # as it is available, and if not, we add our own.
    # Search for our custom id first, for backwards compatibility.
    for pattern in (self.custom_header_id_re, self.message_id_re):
        found = pattern.search(headers)
        if found:
            msg.setId(found.group(1))
            break
    else:
        newid = self._generate_id()
        if options["globals", "verbose"]:
            print >> sys.stderr, "[imapfilter] saving", msg.uid, "with new id:", newid
        msg.setId(newid)
        # The generated id is not written back to the server here;
        # doing so would mean fetching and re-saving the whole message.
        # The vast majority of messages have Message-ID headers, so
        # this case should be rare.
    if options["globals", "verbose"]:
        sys.stdout.write(".")
    return msg
- # Lifted straight from sb_server.py (under the name getNewMessageName)
- def _generate_id(self):
- # The message id is the time it arrived, with a uniquifier
- # appended if two arrive within one clock tick of each other.
- messageName = "%10.10d" % long(time.time())
- if messageName == self.lastBaseMessageName:
- messageName = "%s-%d" % (messageName, self.uniquifier)
- self.uniquifier += 1
- else:
- self.lastBaseMessageName = messageName
- self.uniquifier = 2
- return messageName
def Train(self, classifier, isSpam):
    """Train the classifier on this folder's messages as spam or ham.

    A message recorded as trained in the opposite class is untrained
    first.  Any message that then has no training record is learned as
    `isSpam` and, when the matching "move_trained_*_to_folder" option
    is non-empty, moved to that folder.  Returns the number of
    messages learned.
    """
    opposite = not isSpam
    if isSpam:
        stale_class_opt = "header_ham_string"
        move_opt_name = "move_trained_spam_to_folder"
    else:
        stale_class_opt = "header_spam_string"
        move_opt_name = "move_trained_ham_to_folder"

    num_trained = 0
    for msg in self:
        old_class = None
        if msg.GetTrained() == opposite:
            msg = msg.get_full_message()
            if msg.could_not_retrieve:
                # Not even an invalid message could be fetched, so skip
                # it.  It will be retried on every run until the user
                # notices the errors and moves it.
                continue
            msg.delSBHeaders()
            classifier.unlearn(msg.tokenize(), opposite)
            old_class = options["Headers", stale_class_opt]
            # Record the untraining straight away, in case the
            # training below breaks for some reason.
            msg.RememberTrained(None)

        if msg.GetTrained() is not None:
            continue

        msg = msg.get_full_message()
        if msg.could_not_retrieve:
            continue
        saved_headers = msg.currentSBHeaders()
        msg.delSBHeaders()
        classifier.learn(msg.tokenize(), isSpam)
        num_trained += 1
        msg.RememberTrained(isSpam)
        self.stats.RecordTraining(opposite, old_class=old_class)

        destination = options["imap", move_opt_name]
        if destination != "":
            # The SpamBayes headers were stripped for tokenizing; put
            # them back before the message is saved elsewhere.
            for header, value in saved_headers.items():
                msg[header] = value
            msg.MoveTo(IMAPFolder(destination, self.imap_server,
                                  self.stats))
            msg.Save()
    return num_trained
def Filter(self, classifier, spamfolder, unsurefolder, hamfolder):
    """Classify this folder's messages and file them by result.

    A message is scored when it has no classification yet, or whenever
    `hamfolder` is not None.  Spam is moved to `spamfolder`, unsure
    mail to `unsurefolder`, and ham to `hamfolder` if one was given.
    Returns a dict counting the "ham", "spam" and "unsure" results.
    """
    count = {"ham": 0, "spam": 0, "unsure": 0}
    for msg in self:
        cls = msg.GetClassification()
        if cls is not None and hamfolder is None:
            if options["globals", "verbose"]:
                print >> sys.stderr, "[imapfilter] already classified:", msg.uid
            continue

        if options["globals", "verbose"]:
            print >> sys.stderr, "[imapfilter] classified as %s:" % cls, msg.uid

        msg = msg.get_full_message()
        if msg.could_not_retrieve:
            # Not even an invalid message could be fetched, so skip it.
            # It will be retried on every run until the user notices
            # the errors and moves it.
            if options["globals", "verbose"]:
                print >> sys.stderr, "[imapfilter] could not retrieve:", msg.uid
            continue

        prob, clues = classifier.spamprob(msg.tokenize(), evidence=True)
        # Replace any old SpamBayes headers with the fresh result.
        msg.delSBHeaders()
        msg.addSBHeaders(prob, clues)
        self.stats.RecordClassification(prob)

        cls = msg.GetClassification()
        if cls == options["Headers", "header_ham_string"]:
            # Ham is left where it is unless a ham folder was supplied.
            if hamfolder:
                if options["globals", "verbose"]:
                    print >> sys.stderr, "[imapfilter] moving to ham folder:", msg.uid
                msg.MoveTo(hamfolder)
            count["ham"] += 1
        elif cls == options["Headers", "header_spam_string"]:
            if options["globals", "verbose"]:
                print >> sys.stderr, "[imapfilter] moving to spam folder:", msg.uid
            msg.MoveTo(spamfolder)
            count["spam"] += 1
        else:
            if options["globals", "verbose"]:
                print >> sys.stderr, "[imapfilter] moving to unsure folder:", msg.uid
            msg.MoveTo(unsurefolder)
            count["unsure"] += 1
        msg.Save()

    return count
class IMAPFilter(object):
    """Runs training and classification against a single IMAP server.

    `imap_server` starts out as None and must be assigned before
    Train() or Filter() is called.
    """

    def __init__(self, classifier, stats):
        self.classifier = classifier
        self.stats = stats
        self.imap_server = None
        # Destination folders; Filter() fills these in on first use.
        self.spam_folder = None
        self.unsure_folder = None
        self.ham_folder = None

    def Train(self):
        """Train on every configured ham and spam training folder.

        Folders that cannot be selected are skipped.  The classifier
        is stored only if at least one message was trained.
        """
        assert self.imap_server, "Cannot do anything without IMAP server."

        if options["globals", "verbose"]:
            t = time.time()

        total_trained = 0
        passes = [(False, "ham", "ham_train_folders"),
                  (True, "spam", "spam_train_folders")]
        for is_spam, label, option_name in passes:
            for fol in options["imap", option_name]:
                # Select the folder to make sure it exists.
                try:
                    self.imap_server.SelectFolder(fol)
                except BadIMAPResponseError:
                    print >> sys.stderr, "Skipping", fol, "as it cannot be selected."
                    continue

                if options['globals', 'verbose']:
                    print >> sys.stderr, (" Training %s folder %s" %
                                          (label, fol))
                folder = IMAPFolder(fol, self.imap_server, self.stats)
                num_trained = folder.Train(self.classifier, is_spam)
                total_trained += num_trained
                if options['globals', 'verbose']:
                    print >> sys.stderr, "\n ", num_trained, "trained."

        if total_trained:
            self.classifier.store()

        if options["globals", "verbose"]:
            print >> sys.stderr, ("Training took %.4f seconds, %s messages were trained."
                                  % (time.time() - t, total_trained))

    def _select_or_exit(self, folder, complaint):
        """Select `folder` on the server; print `complaint` and exit on failure."""
        try:
            self.imap_server.SelectFolder(folder.name)
        except BadIMAPResponseError:
            print >> sys.stderr, complaint
            sys.exit(-1)

    def Filter(self):
        """Classify every configured filter folder.

        The spam, unsure and (optional) ham destination folders are
        created on first use and must be selectable, otherwise the
        process exits.  Filter folders that cannot be selected are
        skipped.
        """
        assert self.imap_server, "Cannot do anything without IMAP server."

        if not self.spam_folder:
            spam_folder_name = options["imap", "spam_folder"]
            if options["globals", "verbose"]:
                print >> sys.stderr, "[imapfilter] spam folder:", spam_folder_name
            self.spam_folder = IMAPFolder(
                spam_folder_name, self.imap_server, self.stats)

        if not self.unsure_folder:
            unsure_folder_name = options["imap", "unsure_folder"]
            if options["globals", "verbose"]:
                print >> sys.stderr, "[imapfilter] unsure folder:", unsure_folder_name
            self.unsure_folder = IMAPFolder(
                unsure_folder_name, self.imap_server, self.stats)

        ham_folder_name = options["imap", "ham_folder"]
        if options["globals", "verbose"]:
            print >> sys.stderr, "[imapfilter] ham folder:", ham_folder_name
        if ham_folder_name and not self.ham_folder:
            self.ham_folder = IMAPFolder(ham_folder_name, self.imap_server,
                                         self.stats)

        if options["globals", "verbose"]:
            t = time.time()

        count = {"ham": 0, "spam": 0, "unsure": 0}

        # Select the ham, spam and unsure folders to make sure they exist.
        self._select_or_exit(
            self.spam_folder,
            "Cannot select spam folder. Please check configuration.")
        self._select_or_exit(
            self.unsure_folder,
            "Cannot select unsure folder. Please check configuration.")
        if self.ham_folder:
            self._select_or_exit(
                self.ham_folder,
                "Cannot select ham folder. Please check configuration.")

        for filter_folder in options["imap", "filter_folders"]:
            # Select the folder to make sure it exists.
            try:
                self.imap_server.SelectFolder(filter_folder)
            except BadIMAPResponseError:
                print >> sys.stderr, "Cannot select", filter_folder, "... skipping."
                continue

            folder = IMAPFolder(filter_folder, self.imap_server, self.stats)
            subcount = folder.Filter(self.classifier, self.spam_folder,
                                     self.unsure_folder, self.ham_folder)
            for key in ("ham", "spam", "unsure"):
                count[key] += subcount.get(key, 0)

        if options["globals", "verbose"]:
            print >> sys.stderr, ("\nClassified %s ham, %s spam, and %s unsure." %
                                  (count["ham"], count["spam"], count["unsure"]))
            print >> sys.stderr, "Classifying took %.4f seconds." % (time.time() - t,)
def servers(promptForPass = False):
    """Return (server, username, password) tuples for the IMAP options.

    The user is prompted for a password for every username when
    promptForPass is true, or when the options hold fewer passwords
    than usernames.
    """
    hosts = options["imap", "server"]
    usernames = options["imap", "username"]
    pwds = options["imap", "password"]
    if promptForPass or len(pwds) < len(usernames):
        pwds = [getpass("Enter password for %s:" % (u,)) for u in usernames]
    return zip(hosts, usernames, pwds)
-
- def run(force_UI=False):
- try:
- opts, args = getopt.getopt(sys.argv[1:], 'hbPtcvl:e:i:d:p:o:',
- ["verbose"])
- except getopt.error, msg:
- print >> sys.stderr, str(msg) + '\n\n' + __doc__
- sys.exit()
- doTrain = False
- doClassify = False
- doExpunge = options["imap", "expunge"]
- imapDebug = 0
- sleepTime = 0
- promptForPass = False
- launchUI = False
- for opt, arg in opts:
- if opt == '-h':
- print >> sys.stderr, __doc__
- sys.exit()
- elif opt == "-b":
- launchUI = True
- elif opt == '-t':
- doTrain = True
- elif opt == '-P':
- promptForPass = True
- elif opt == '-c':
- doClassify = True
- elif opt in ('-v', '--verbose'):
- options["globals", "verbose"] = True
- elif opt == '-e':
- if arg == 'y':
- doExpunge = True
- else:
- doExpunge = False
- elif opt == '-i':
- imapDebug = int(arg)
- elif opt == '-l':
- sleepTime = int(arg) * 60
- elif opt == '-o':
- options.set_from_cmdline(arg, sys.stderr)
- bdbname, useDBM = storage.database_type(opts)
- # Let the user know what they are using...
- v = get_current_version();
- print "%s.\n" % (v.get_long_version("SpamBayes IMAP Filter"),)
- if options["globals", "verbose"]:
- print "Loading database %s..." % (bdbname),
- classifier = storage.open_storage(bdbname, useDBM)
- message_db = message.Message().message_info_db
- if options["globals", "verbose"]:
- print "Done."
- if not ( launchUI or force_UI or options["imap", "server"] ):
- print "You need to specify both a server and a username."
- sys.exit()
- servers_data = servers(promptForPass)
-
- # Load stats manager.
- stats = Stats.Stats(options, message_db)
-
- imap_filter = IMAPFilter(classifier, stats)
- # Web interface. We have changed the rules about this many times.
- # With 1.0.x, the rule is that the interface is served if we are
- # not classifying or training. However, this runs into the problem
- # that if we run with -l, we might still want to edit the options,
- # and we don't want to start a separate instance, because then the
- # database is accessed from two processes.
- # With 1.1.x, the rule is that the interface is also served if the
- # -l option is used, which means it is only not served if we are
- # doing a one-off classification/train. In that case, there would
- # probably not be enough time to get to the interface and interact
- # with it (and we don't want it to die halfway through!), and we
- # don't want to slow classification/training down, either.
- if sleepTime or not (doClassify or doTrain):
- imaps = []
- for server, username, password in servers_data:
- if server == "":
- imaps.append(None)
- else:
- imaps.append(IMAPSession(server, imapDebug, doExpunge))
- def close_db():
- message_db.store()
- message_db.close()
- message.Message().message_info_db.store()
- message.Message().message_info_db.close()
- message.Message.message_info_db = None
- classifier.store()
- classifier.close()
- def change_db():
- classifier = storage.open_storage(*storage.database_type(opts))
- message.Message.message_info_db = message_db
- imap_filter = IMAPFilter(classifier, message_db)
- httpServer = UserInterfaceServer(options["html_ui", "port"])
- pwds = [ x[2] for x in servers_data ]
- httpServer.register(IMAPUserInterface(classifier, imaps, pwds,
- IMAPSession, stats=stats,
- close_db=close_db,
- change_db=change_db))
- launchBrowser = launchUI or options["html_ui", "launch_browser"]
- if sleepTime:
- # Run in a separate thread, as we have more work to do.
- thread.start_new_thread(Dibbler.run, (),
- {"launchBrowser":launchBrowser})
- else:
- Dibbler.run(launchBrowser=launchBrowser)
- if doClassify or doTrain:
- imaps = []
- for server, username, password in servers_data:
- imaps.append(((server, imapDebug, doExpunge),
- username, password))
- # In order to make working with multiple servers easier, we
- # allow the user to have separate configuration files for each
- # server. These may specify different folders to watch, different
- # spam/unsure folders, or any other options (e.g. thresholds).
- # For each server we use the default (global) options, and load
- # the specific options on top. To facilitate this, we use a
- # restore point for the options with just the default (global)
- # options.
- # XXX What about when we are running with -l and change options
- # XXX via the web interface? We need to handle that, really.
- options.set_restore_point()
- while True:
- for (server, imapDebug, doExpunge), username, password in imaps:
- imap = IMAPSession(server, imapDebug, doExpunge)
- if options["globals", "verbose"]:
- print "Account: %s:%s" % (imap.server, imap.port)
- if imap.connected:
- # As above, we load a separate configuration file
- # for each server, if it exists. We look for a
- # file in the optionsPathname directory, with the
- # name server.name.ini or .spambayes_server_name_rc
- # XXX While 1.1 is in alpha these names can be
- # XXX changed if desired. Please let Tony know!
- basedir = os.path.dirname(optionsPathname)
- fn1 = os.path.join(basedir, imap.server + ".ini")
- fn2 = os.path.join(basedir,
- imap.server.replace(".", "_") + \
- "_rc")
- for fn in (fn1, fn2):
- if os.path.exists(fn):
- options.merge_file(fn)
- try:
- imap.login(username, password)
- except LoginFailure, e:
- print str(e)
- continue
- imap_filter.imap_server = imap
- if doTrain:
- if options["globals", "verbose"]:
- print "Training"
- imap_filter.Train()
- if doClassify:
- if options["globals", "verbose"]:
- print "Classifying"
- imap_filter.Filter()
- imap.logout()
- options.revert_to_restore_point()
- else:
- # Failed to connect. This may be a temporary problem,
- # so just continue on and try again. If we are only
- # running once we will end, otherwise we'll try again
- # in sleepTime seconds.
- # XXX Maybe we should log this error message?
- pass
- if sleepTime:
- time.sleep(sleepTime)
- else:
- break
# Script entry point: run the filter only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    run()