PageRenderTime 86ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/fstmerge/examples/SpamBayes/rev3250-3267/base-trunk-3250/scripts/sb_mboxtrain.py

https://github.com/RoDaniel/featurehouse
Python | 272 lines | 248 code | 0 blank | 24 comment | 0 complexity | a7ecdd2a73a98d046932cb94ecd75f9a MD5 | raw file
  1. """Usage: %(program)s [OPTIONS] ...
  2. Where OPTIONS is one or more of:
  3. -h
  4. show usage and exit
  5. -d DBNAME
  6. use the DBM store. A DBM file is larger than the pickle and
  7. creating it is slower, but loading it is much faster,
  8. especially for large word databases. Recommended for use with
  9. sb_filter or any procmail-based filter.
  10. -p DBNAME
  11. use the pickle store. A pickle is smaller and faster to create,
  12. but much slower to load. Recommended for use with sb_server and
  13. sb_xmlrpcserver.
  14. -g PATH
  15. mbox or directory of known good messages (non-spam) to train on.
  16. Can be specified more than once.
  17. -s PATH
  18. mbox or directory of known spam messages to train on.
  19. Can be specified more than once.
  20. -f
  21. force training, ignoring the trained header. Use this if you
  22. need to rebuild your database from scratch.
  23. -q
  24. quiet mode; no output
  25. -n train mail residing in "new" directory, in addition to "cur"
  26. directory, which is always trained (Maildir only)
  27. -r remove mail which was trained on (Maildir only)
  28. -o section:option:value
  29. set [section, option] in the options database to value
  30. """
  31. import sys, os, getopt, email
  32. import shutil
  33. from spambayes import hammie, storage, mboxutils
  34. from spambayes.Options import options, get_pathname_option
  35. program = sys.argv[0]
  36. loud = True
  37. def get_message(obj):
  38. """Return an email Message object.
  39. This works like mboxutils.get_message, except it doesn't junk the
  40. headers if there's an error. Doing so would cause a headerless
  41. message to be written back out!
  42. """
  43. if isinstance(obj, email.Message.Message):
  44. return obj
  45. if hasattr(obj, "read"):
  46. obj = obj.read()
  47. try:
  48. msg = email.message_from_string(obj)
  49. except email.Errors.MessageParseError:
  50. msg = None
  51. return msg
  52. def msg_train(h, msg, is_spam, force):
  53. """Train bayes with a single message."""
  54. try:
  55. mboxutils.as_string(msg)
  56. except TypeError:
  57. return False
  58. if is_spam:
  59. spamtxt = options["Headers", "header_spam_string"]
  60. else:
  61. spamtxt = options["Headers", "header_ham_string"]
  62. oldtxt = msg.get(options["Headers", "trained_header_name"])
  63. if force:
  64. if oldtxt != None:
  65. del msg[options["Headers", "trained_header_name"]]
  66. elif oldtxt == spamtxt:
  67. return False
  68. elif oldtxt != None:
  69. del msg[options["Headers", "trained_header_name"]]
  70. h.untrain(msg, not is_spam)
  71. h.train(msg, is_spam)
  72. msg.add_header(options["Headers", "trained_header_name"], spamtxt)
  73. return True
  74. def maildir_train(h, path, is_spam, force, removetrained):
  75. """Train bayes with all messages from a maildir."""
  76. if loud:
  77. print " Reading %s as Maildir" % (path,)
  78. import time
  79. import socket
  80. pid = os.getpid()
  81. host = socket.gethostname()
  82. counter = 0
  83. trained = 0
  84. for fn in os.listdir(path):
  85. cfn = os.path.join(path, fn)
  86. tfn = os.path.normpath(os.path.join(path, "..", "tmp",
  87. "%d.%d_%d.%s" % (time.time(), pid,
  88. counter, host)))
  89. if (os.path.isdir(cfn)):
  90. continue
  91. counter += 1
  92. if loud and counter % 10 == 0:
  93. sys.stdout.write("\r%6d" % counter)
  94. sys.stdout.flush()
  95. f = file(cfn, "rb")
  96. msg = get_message(f)
  97. f.close()
  98. if not msg:
  99. print "Malformed message: %s. Skipping..." % cfn
  100. continue
  101. if not msg_train(h, msg, is_spam, force):
  102. continue
  103. trained += 1
  104. if not options["Headers", "include_trained"]:
  105. continue
  106. f = file(tfn, "wb")
  107. f.write(mboxutils.as_string(msg))
  108. f.close()
  109. shutil.copystat(cfn, tfn)
  110. os.rename(tfn, cfn)
  111. if (removetrained):
  112. os.unlink(cfn)
  113. if loud:
  114. sys.stdout.write("\r%6d" % counter)
  115. sys.stdout.write("\r Trained %d out of %d messages\n" %
  116. (trained, counter))
  117. def mbox_train(h, path, is_spam, force):
  118. """Train bayes with a Unix mbox"""
  119. if loud:
  120. print " Reading as Unix mbox"
  121. import mailbox
  122. import fcntl
  123. f = file(path, "r+b")
  124. fcntl.flock(f, fcntl.LOCK_EX)
  125. mbox = mailbox.PortableUnixMailbox(f, get_message)
  126. outf = os.tmpfile()
  127. counter = 0
  128. trained = 0
  129. for msg in mbox:
  130. if not msg:
  131. print "Malformed message number %d. I can't train on this mbox, sorry." % counter
  132. return
  133. counter += 1
  134. if loud and counter % 10 == 0:
  135. sys.stdout.write("\r%6d" % counter)
  136. sys.stdout.flush()
  137. if msg_train(h, msg, is_spam, force):
  138. trained += 1
  139. if options["Headers", "include_trained"]:
  140. outf.write(mboxutils.as_string(msg, True))
  141. if options["Headers", "include_trained"]:
  142. outf.seek(0)
  143. try:
  144. os.ftruncate(f.fileno(), 0)
  145. f.seek(0)
  146. except:
  147. print "Problem truncating mbox--nothing written"
  148. raise
  149. try:
  150. for line in outf.xreadlines():
  151. f.write(line)
  152. except:
  153. print >> sys.stderr ("Problem writing mbox! Sorry, "
  154. "I tried my best, but your mail "
  155. "may be corrupted.")
  156. raise
  157. fcntl.flock(f, fcntl.LOCK_UN)
  158. f.close()
  159. if loud:
  160. sys.stdout.write("\r%6d" % counter)
  161. sys.stdout.write("\r Trained %d out of %d messages\n" %
  162. (trained, counter))
  163. def mhdir_train(h, path, is_spam, force):
  164. """Train bayes with an mh directory"""
  165. if loud:
  166. print " Reading as MH mailbox"
  167. import glob
  168. counter = 0
  169. trained = 0
  170. for fn in glob.glob(os.path.join(path, "[0-9]*")):
  171. counter += 1
  172. cfn = fn
  173. tfn = os.path.join(path, "spambayes.tmp")
  174. if loud and counter % 10 == 0:
  175. sys.stdout.write("\r%6d" % counter)
  176. sys.stdout.flush()
  177. f = file(fn, "rb")
  178. msg = get_message(f)
  179. f.close()
  180. if not msg:
  181. print "Malformed message: %s. Skipping..." % cfn
  182. continue
  183. msg_train(h, msg, is_spam, force)
  184. trained += 1
  185. if not options["Headers", "include_trained"]:
  186. continue
  187. f = file(tfn, "wb")
  188. f.write(mboxutils.as_string(msg))
  189. f.close()
  190. shutil.copystat(cfn, tfn)
  191. os.rename(tfn, cfn)
  192. if loud:
  193. sys.stdout.write("\r%6d" % counter)
  194. sys.stdout.write("\r Trained %d out of %d messages\n" %
  195. (trained, counter))
  196. def train(h, path, is_spam, force, trainnew, removetrained):
  197. if not os.path.exists(path):
  198. raise ValueError("Nonexistent path: %s" % path)
  199. elif os.path.isfile(path):
  200. mbox_train(h, path, is_spam, force)
  201. elif os.path.isdir(os.path.join(path, "cur")):
  202. maildir_train(h, os.path.join(path, "cur"), is_spam, force,
  203. removetrained)
  204. if trainnew:
  205. maildir_train(h, os.path.join(path, "new"), is_spam, force,
  206. removetrained)
  207. elif os.path.isdir(path):
  208. mhdir_train(h, path, is_spam, force)
  209. else:
  210. raise ValueError("Unable to determine mailbox type: " + path)
  211. def usage(code, msg=''):
  212. """Print usage message and sys.exit(code)."""
  213. if msg:
  214. print >> sys.stderr, msg
  215. print >> sys.stderr
  216. print >> sys.stderr, __doc__ % globals()
  217. sys.exit(code)
  218. def main():
  219. """Main program; parse options and go."""
  220. global loud
  221. try:
  222. opts, args = getopt.getopt(sys.argv[1:], 'hfqnrd:p:g:s:o:')
  223. except getopt.error, msg:
  224. usage(2, msg)
  225. if not opts:
  226. usage(2, "No options given")
  227. force = False
  228. trainnew = False
  229. removetrained = False
  230. good = []
  231. spam = []
  232. for opt, arg in opts:
  233. if opt == '-h':
  234. usage(0)
  235. elif opt == "-f":
  236. force = True
  237. elif opt == "-n":
  238. trainnew = True
  239. elif opt == "-q":
  240. loud = False
  241. elif opt == '-g':
  242. good.append(arg)
  243. elif opt == '-s':
  244. spam.append(arg)
  245. elif opt == "-r":
  246. removetrained = True
  247. elif opt == '-o':
  248. options.set_from_cmdline(arg, sys.stderr)
  249. pck, usedb = storage.database_type(opts)
  250. if args:
  251. usage(2, "Positional arguments not allowed")
  252. if usedb == None:
  253. usedb = options["Storage", "persistent_use_database"]
  254. pck = get_pathname_option("Storage",
  255. "persistent_storage_file")
  256. h = hammie.open(pck, usedb, "c")
  257. for g in good:
  258. if loud:
  259. print "Training ham (%s):" % g
  260. train(h, g, False, force, trainnew, removetrained)
  261. sys.stdout.flush()
  262. save = True
  263. for s in spam:
  264. if loud:
  265. print "Training spam (%s):" % s
  266. train(h, s, True, force, trainnew, removetrained)
  267. sys.stdout.flush()
  268. save = True
  269. if save:
  270. h.store()
  271. if __name__ == "__main__":
  272. main()