/fstmerge/examples/SpamBayes/rev3250-3267/base-trunk-3250/scripts/sb_mboxtrain.py
Python | 272 lines | 248 code | 0 blank | 24 comment | 0 complexity | a7ecdd2a73a98d046932cb94ecd75f9a MD5 | raw file
- """Usage: %(program)s [OPTIONS] ...
- Where OPTIONS is one or more of:
- -h
- show usage and exit
- -d DBNAME
- use the DBM store. A DBM file is larger than the pickle and
- creating it is slower, but loading it is much faster,
- especially for large word databases. Recommended for use with
- sb_filter or any procmail-based filter.
- -p DBNAME
- use the pickle store. A pickle is smaller and faster to create,
- but much slower to load. Recommended for use with sb_server and
- sb_xmlrpcserver.
- -g PATH
- mbox or directory of known good messages (non-spam) to train on.
- Can be specified more than once.
- -s PATH
- mbox or directory of known spam messages to train on.
- Can be specified more than once.
- -f
- force training, ignoring the trained header. Use this if you
- need to rebuild your database from scratch.
- -q
- quiet mode; no output
- -n train mail residing in "new" directory, in addition to "cur"
- directory, which is always trained (Maildir only)
- -r remove mail which was trained on (Maildir only)
- -o section:option:value
- set [section, option] in the options database to value
- """
- import sys, os, getopt, email
- import shutil
- from spambayes import hammie, storage, mboxutils
- from spambayes.Options import options, get_pathname_option
- program = sys.argv[0]  # substituted for %(program)s when usage() formats the module docstring with globals()
- loud = True  # progress output is on by default; main() sets this to False for -q
def get_message(obj):
    """Return an email Message object.

    This works like mboxutils.get_message, except it doesn't junk the
    headers if there's an error. Doing so would cause a headerless
    message to be written back out!

    obj may be:
      * a Message instance, which is returned unchanged;
      * any object with a read() method, whose contents are parsed;
      * a string holding the raw message text.

    Returns the parsed Message, or None if the parser raises
    MessageParseError.
    """
    # The capitalised aliases (email.Message, email.Errors) were deprecated
    # when the email package renamed its modules and no longer exist in
    # Python 3.  Prefer the lowercase names and fall back to the old ones
    # only for interpreters that predate the rename.
    try:
        from email.message import Message
        from email.errors import MessageParseError
    except ImportError:
        from email.Message import Message
        from email.Errors import MessageParseError

    if isinstance(obj, Message):
        return obj
    if hasattr(obj, "read"):
        obj = obj.read()
    try:
        return email.message_from_string(obj)
    except MessageParseError:
        return None
def msg_train(h, msg, is_spam, force):
    """Train bayes with a single message.

    h        -- trainer object; its train(msg, is_spam) and
                untrain(msg, is_spam) methods are called here.
    msg      -- email Message object; its "trained" header is updated
                in place.
    is_spam  -- True to train as spam, False to train as ham.
    force    -- if true, train even when the trained header says the
                message was already trained (no untraining is done).

    Returns True if the message was trained on, False if it was skipped
    (it could not be flattened to a string, or it was already trained
    with the same classification and force is false).
    """
    try:
        mboxutils.as_string(msg)
    except TypeError:
        # A message that cannot be flattened is skipped entirely.
        return False

    header_name = options["Headers", "trained_header_name"]
    if is_spam:
        spamtxt = options["Headers", "header_spam_string"]
    else:
        spamtxt = options["Headers", "header_ham_string"]
    oldtxt = msg.get(header_name)

    if not force and oldtxt == spamtxt:
        # Already trained with this classification.
        return False

    if oldtxt is not None:
        del msg[header_name]
        if not force:
            # Previously trained as the opposite class: undo that first.
            h.untrain(msg, not is_spam)

    h.train(msg, is_spam)
    msg.add_header(header_name, spamtxt)
    return True
def maildir_train(h, path, is_spam, force, removetrained):
    """Train bayes with all messages from a maildir.

    h             -- trainer object handed through to msg_train().
    path          -- directory holding the message files (train() passes
                     the "cur" or "new" subdirectory of the Maildir).
    is_spam       -- True to train as spam, False as ham.
    force         -- passed to msg_train(); retrain already-trained mail.
    removetrained -- if true, delete every message that was trained on.

    Sub-directories of path are ignored.  Unparseable messages are
    reported and skipped.  When the "include_trained" header option is
    on, each trained message that is kept is rewritten (via a temporary
    file in the sibling "tmp" directory) so the trained header is saved.
    """
    if loud:
        sys.stdout.write(" Reading %s as Maildir\n" % (path,))

    import time
    import socket

    pid = os.getpid()
    host = socket.gethostname()
    counter = 0
    trained = 0

    for fn in os.listdir(path):
        cfn = os.path.join(path, fn)
        if os.path.isdir(cfn):
            continue
        counter += 1
        if loud and counter % 10 == 0:
            sys.stdout.write("\r%6d" % counter)
            sys.stdout.flush()

        f = open(cfn, "rb")
        try:
            msg = get_message(f)
        finally:
            # Close the handle even if parsing blows up.
            f.close()
        if not msg:
            sys.stdout.write("Malformed message: %s. Skipping...\n" % cfn)
            continue
        if not msg_train(h, msg, is_spam, force):
            continue
        trained += 1

        if removetrained:
            # Honour -r regardless of the include_trained setting, and
            # don't bother rewriting a file that is about to be deleted.
            os.unlink(cfn)
            continue
        if not options["Headers", "include_trained"]:
            continue

        # Write the updated message to a uniquely named temp file, then
        # rename it over the original so the replacement is a single step.
        tfn = os.path.normpath(os.path.join(path, "..", "tmp",
                                            "%d.%d_%d.%s" % (time.time(), pid,
                                                             counter, host)))
        f = open(tfn, "wb")
        try:
            f.write(mboxutils.as_string(msg))
        finally:
            f.close()
        shutil.copystat(cfn, tfn)
        os.rename(tfn, cfn)

    if loud:
        sys.stdout.write("\r%6d" % counter)
        sys.stdout.write("\r Trained %d out of %d messages\n" %
                         (trained, counter))
def _rewrite_mbox(f, outf):
    """Replace the contents of the open mbox file f with those of outf."""
    outf.seek(0)
    # The bare excepts below are deliberate: the warning must be printed
    # for any failure (including an interrupt) and the exception is
    # always re-raised.
    try:
        os.ftruncate(f.fileno(), 0)
        f.seek(0)
    except:
        sys.stdout.write("Problem truncating mbox--nothing written\n")
        raise
    try:
        shutil.copyfileobj(outf, f)
        # Push the data out while the caller still holds the lock.
        f.flush()
    except:
        sys.stderr.write("Problem writing mbox! Sorry, "
                         "I tried my best, but your mail "
                         "may be corrupted.\n")
        raise


def mbox_train(h, path, is_spam, force):
    """Train bayes with a Unix mbox.

    h, is_spam and force are passed through to msg_train() for every
    message in the mbox file at path.  The file is opened read/write and
    held under an exclusive flock for the whole run.

    When the "include_trained" header option is on, every message (with
    its Unix "From " line requested from mboxutils.as_string) is copied
    to a temporary file and the mbox is rewritten from it afterwards.

    If a malformed message is encountered, a message is printed and the
    function returns without modifying the mbox.  The lock is released
    and both files are closed on every exit path.
    """
    if loud:
        sys.stdout.write(" Reading as Unix mbox\n")

    import mailbox
    import fcntl

    f = open(path, "r+b")
    outf = None
    try:
        fcntl.flock(f, fcntl.LOCK_EX)
        mbox = mailbox.PortableUnixMailbox(f, get_message)
        outf = os.tmpfile()
        counter = 0
        trained = 0

        for msg in mbox:
            if not msg:
                sys.stdout.write("Malformed message number %d. I can't train on this mbox, sorry.\n" % counter)
                return
            counter += 1
            if loud and counter % 10 == 0:
                sys.stdout.write("\r%6d" % counter)
                sys.stdout.flush()
            if msg_train(h, msg, is_spam, force):
                trained += 1
            if options["Headers", "include_trained"]:
                outf.write(mboxutils.as_string(msg, True))

        if options["Headers", "include_trained"]:
            _rewrite_mbox(f, outf)
    finally:
        if outf is not None:
            outf.close()
        try:
            fcntl.flock(f, fcntl.LOCK_UN)
        finally:
            f.close()

    if loud:
        sys.stdout.write("\r%6d" % counter)
        sys.stdout.write("\r Trained %d out of %d messages\n" %
                         (trained, counter))
def mhdir_train(h, path, is_spam, force):
    """Train bayes with an mh directory.

    Every entry of path whose name starts with a digit is read as one
    message and handed to msg_train() together with h, is_spam and
    force.  Unparseable messages are reported and skipped.

    Only messages that msg_train() actually trained on are counted as
    trained.  When the "include_trained" header option is on, those
    messages are rewritten through the temporary file "spambayes.tmp"
    inside path so the trained header is saved.
    """
    if loud:
        sys.stdout.write(" Reading as MH mailbox\n")

    import glob

    counter = 0
    trained = 0
    # The same temp file name is reused for every message.
    tfn = os.path.join(path, "spambayes.tmp")

    for cfn in glob.glob(os.path.join(path, "[0-9]*")):
        counter += 1
        if loud and counter % 10 == 0:
            sys.stdout.write("\r%6d" % counter)
            sys.stdout.flush()

        f = open(cfn, "rb")
        try:
            msg = get_message(f)
        finally:
            f.close()
        if not msg:
            sys.stdout.write("Malformed message: %s. Skipping...\n" % cfn)
            continue
        # Skipped messages (e.g. already trained) must not be counted or
        # rewritten, matching what maildir_train() does.
        if not msg_train(h, msg, is_spam, force):
            continue
        trained += 1
        if not options["Headers", "include_trained"]:
            continue

        f = open(tfn, "wb")
        try:
            f.write(mboxutils.as_string(msg))
        finally:
            f.close()
        shutil.copystat(cfn, tfn)
        os.rename(tfn, cfn)

    if loud:
        sys.stdout.write("\r%6d" % counter)
        sys.stdout.write("\r Trained %d out of %d messages\n" %
                         (trained, counter))
def train(h, path, is_spam, force, trainnew, removetrained):
    """Train on the mailbox at path, picking the trainer by its layout.

    A regular file is treated as a Unix mbox, a directory containing a
    "cur" subdirectory as a Maildir ("new" is trained as well when
    trainnew is true), and any other directory as an MH folder.

    Raises ValueError if path does not exist or its type cannot be
    determined.
    """
    if not os.path.exists(path):
        raise ValueError("Nonexistent path: %s" % path)

    if os.path.isfile(path):
        mbox_train(h, path, is_spam, force)
        return

    if os.path.isdir(os.path.join(path, "cur")):
        subdirs = ["cur"]
        if trainnew:
            subdirs.append("new")
        for sub in subdirs:
            maildir_train(h, os.path.join(path, sub), is_spam, force,
                          removetrained)
        return

    if os.path.isdir(path):
        mhdir_train(h, path, is_spam, force)
        return

    raise ValueError("Unable to determine mailbox type: " + path)
def usage(code, msg=''):
    """Write the usage text to stderr and exit with status code.

    If msg is non-empty it is written first, followed by a blank line.
    The usage text is the module docstring formatted with globals().
    """
    err = sys.stderr
    if msg:
        err.write("%s\n" % (msg,))
        err.write("\n")
    err.write("%s\n" % (__doc__ % globals(),))
    sys.exit(code)
def main():
    """Main program; parse options and go.

    Parses sys.argv, opens the training database, trains on every -g
    (ham) and -s (spam) path in turn, and stores the database if at
    least one path was trained.  Exits through usage() on bad options,
    when no options are given, or when positional arguments are present.
    """
    global loud

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hfqnrd:p:g:s:o:')
    except getopt.error:
        # sys.exc_info() avoids both the "except X, name" and the
        # "except X as name" spellings, so this parses on any Python.
        usage(2, sys.exc_info()[1])

    if not opts:
        usage(2, "No options given")

    force = False
    trainnew = False
    removetrained = False
    good = []
    spam = []
    for opt, arg in opts:
        if opt == '-h':
            usage(0)
        elif opt == "-f":
            force = True
        elif opt == "-n":
            trainnew = True
        elif opt == "-q":
            loud = False
        elif opt == '-g':
            good.append(arg)
        elif opt == '-s':
            spam.append(arg)
        elif opt == "-r":
            removetrained = True
        elif opt == '-o':
            options.set_from_cmdline(arg, sys.stderr)
    # -d / -p are not handled in the loop above; the storage helper is
    # given the full option list instead.
    pck, usedb = storage.database_type(opts)
    if args:
        usage(2, "Positional arguments not allowed")

    if usedb is None:
        # No database chosen on the command line: use the configured one.
        usedb = options["Storage", "persistent_use_database"]
        pck = get_pathname_option("Storage",
                                  "persistent_storage_file")

    h = hammie.open(pck, usedb, "c")

    # Must start out False: with no -g/-s paths neither loop body runs,
    # and the final check would otherwise hit an unbound local.
    save = False

    for g in good:
        if loud:
            sys.stdout.write("Training ham (%s):\n" % g)
        train(h, g, False, force, trainnew, removetrained)
        sys.stdout.flush()
        save = True

    for s in spam:
        if loud:
            sys.stdout.write("Training spam (%s):\n" % s)
        train(h, s, True, force, trainnew, removetrained)
        sys.stdout.flush()
        save = True

    if save:
        h.store()
- if __name__ == "__main__":  # run main() only when executed as a script, not when imported
- main()