PageRenderTime 29ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/tags/Release_2_1b1/mailman/Mailman/Handlers/Scrubber.py

#
Python | 309 lines | 264 code | 11 blank | 34 comment | 11 complexity | a8961c99727342c0d2f117a0a7d74654 MD5 | raw file
Possible License(s): GPL-2.0
  1. # Copyright (C) 2001,2002 by the Free Software Foundation, Inc.
  2. #
  3. # This program is free software; you can redistribute it and/or
  4. # modify it under the terms of the GNU General Public License
  5. # as published by the Free Software Foundation; either version 2
  6. # of the License, or (at your option) any later version.
  7. #
  8. # This program is distributed in the hope that it will be useful,
  9. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. # GNU General Public License for more details.
  12. #
  13. # You should have received a copy of the GNU General Public License
  14. # along with this program; if not, write to the Free Software
  15. # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  16. """Cleanse a message for archiving.
  17. """
  18. import os
  19. import re
  20. import sha
  21. import cgi
  22. import errno
  23. import mimetypes
  24. import tempfile
  25. from cStringIO import StringIO
  26. from types import IntType
  27. from email.Parser import HeaderParser
  28. from email.Generator import Generator
  29. from Mailman import mm_cfg
  30. from Mailman import Utils
  31. from Mailman import LockFile
  32. from Mailman import Message
  33. from Mailman.Errors import DiscardMessage
  34. from Mailman.i18n import _
  35. from Mailman.Logging.Syslog import syslog
  36. # Path characters for common platforms
  37. pre = re.compile(r'[/\\:]')
  38. # All other characters to strip out of Content-Disposition: filenames
  39. # (essentially anything that isn't an alphanum, dot, slash, or underscore.
  40. sre = re.compile(r'[^-\w.]')
  41. BR = '<br>\n'
  42. # We're using a subclass of the standard Generator because we want to suppress
  43. # headers in the subparts of multiparts. We use a hack -- the ctor argument
  44. # skipheaders to accomplish this. It's set to true for the outer Message
  45. # object, but false for all internal objects. We recognize that
  46. # sub-Generators will get created passing only mangle_from_ and maxheaderlen
  47. # to the ctors.
  48. #
  49. # This isn't perfect because we still get stuff like the multipart boundaries,
  50. # but see below for how we corrupt that to our nefarious goals.
  51. class ScrubberGenerator(Generator):
  52. def __init__(self, outfp, mangle_from_=1, maxheaderlen=78, skipheaders=1):
  53. Generator.__init__(self, outfp, mangle_from_=0)
  54. self.__skipheaders = skipheaders
  55. def _write_headers(self, msg):
  56. if not self.__skipheaders:
  57. Generator._write_headers(self, msg)
  58. def process(mlist, msg, msgdata=None):
  59. sanitize = mm_cfg.ARCHIVE_HTML_SANITIZER
  60. outer = 1
  61. for part in msg.walk():
  62. # If the part is text/plain, we leave it alone
  63. if part.get_type('text/plain') == 'text/plain':
  64. pass
  65. elif part.get_type() == 'text/html' and isinstance(sanitize, IntType):
  66. if sanitize == 0:
  67. if outer:
  68. raise DiscardMessage
  69. part.set_payload(_('HTML attachment scrubbed and removed'))
  70. elif sanitize == 2:
  71. # By leaving it alone, Pipermail will automatically escape it
  72. pass
  73. elif sanitize == 3:
  74. # Pull it out as an attachment but leave it unescaped. This
  75. # is dangerous, but perhaps useful for heavily moderated
  76. # lists.
  77. omask = os.umask(002)
  78. try:
  79. url = save_attachment(mlist, part, filter_html=0)
  80. finally:
  81. os.umask(omask)
  82. part.set_payload(_("""\
  83. An HTML attachment was scrubbed...
  84. URL: %(url)s
  85. """))
  86. else:
  87. # HTML-escape it and store it as an attachment, but make it
  88. # look a /little/ bit prettier. :(
  89. payload = cgi.escape(part.get_payload())
  90. # For whitespace in the margin, change spaces into
  91. # non-breaking spaces, and tabs into 8 of those. Then use a
  92. # mono-space font. Still looks hideous to me, but then I'd
  93. # just as soon discard them.
  94. def doreplace(s):
  95. return s.replace(' ', '&nbsp;').replace('\t', '&nbsp'*8)
  96. lines = [doreplace(s) for s in payload.split('\n')]
  97. payload = '<tt>\n' + BR.join(lines) + '\n</tt>\n'
  98. part.set_payload(payload)
  99. omask = os.umask(002)
  100. try:
  101. url = save_attachment(mlist, part, filter_html=0)
  102. finally:
  103. os.umask(omask)
  104. part.set_payload(_("""\
  105. An HTML attachment was scrubbed...
  106. URL: %(url)s
  107. """))
  108. elif part.get_type() == 'message/rfc822':
  109. # This part contains a submessage, so it too needs scrubbing
  110. submsg = part.get_payload()
  111. omask = os.umask(002)
  112. try:
  113. url = save_attachment(mlist, part)
  114. finally:
  115. os.umask(omask)
  116. subject = submsg.get('subject', _('no subject'))
  117. date = submsg.get('date', _('no date'))
  118. who = submsg.get('from', _('unknown sender'))
  119. size = len(str(submsg))
  120. part.set_payload(_("""\
  121. An embedded message was scrubbed...
  122. From: %(who)s
  123. Subject: %(subject)s
  124. Date: %(date)s
  125. Size: %(size)s
  126. Url: %(url)s
  127. """))
  128. # If we were to leave the message/rfc822 Content-Type: header, it
  129. # would confuse the generator. So just delete it. The generator
  130. # will treat this as a text/plain message.
  131. del part['content-type']
  132. # If the message isn't a multipart, then we'll strip it out as an
  133. # attachment that would have to be separately downloaded. Pipermail
  134. # will transform the url into a hyperlink.
  135. elif not part.is_multipart():
  136. payload = part.get_payload()
  137. ctype = part.get_type()
  138. size = len(payload)
  139. omask = os.umask(002)
  140. try:
  141. url = save_attachment(mlist, part)
  142. finally:
  143. os.umask(omask)
  144. desc = part.get('content-description', _('not available'))
  145. filename = part.get_filename(_('not available'))
  146. part.set_payload(_("""\
  147. A non-text attachment was scrubbed...
  148. Name: %(filename)s
  149. Type: %(ctype)s
  150. Size: %(size)d bytes
  151. Desc: %(desc)s
  152. Url : %(url)s
  153. """))
  154. outer = 0
  155. # We still have to sanitize the message to flat text because Pipermail
  156. # can't handle messages with list payloads. This is a kludge (def (n)
  157. # clever hack ;).
  158. if msg.is_multipart():
  159. # We're corrupting the boundary to provide some more useful
  160. # information, because while we can suppress subpart headers, we can't
  161. # suppress the inter-part boundary without a redesign of the Generator
  162. # class or a rewrite of of the whole _handle_multipart() method.
  163. msg.set_boundary('%s %s attachment' %
  164. ('-'*20, msg.get_type('text/plain')))
  165. sfp = StringIO()
  166. g = ScrubberGenerator(sfp, mangle_from_=0, skipheaders=0)
  167. g(msg)
  168. sfp.seek(0)
  169. # We don't care about parsing the body because we've already scrubbed
  170. # it of nasty stuff. Just slurp it all in.
  171. msg = HeaderParser(Message.Message).parse(sfp)
  172. return msg
  173. def save_attachment(mlist, msg, filter_html=1):
  174. # The directory to store the attachment in
  175. dir = os.path.join(mlist.archive_dir(), 'attachments')
  176. try:
  177. os.mkdir(dir, 02775)
  178. except OSError, e:
  179. if e.errno <> errno.EEXIST: raise
  180. # We need a directory to contain this message's attachments. Base it
  181. # on the Message-ID: so that all attachments for the same message end
  182. # up in the same directory (we'll uniquify the filenames in that
  183. # directory as needed). We use the first 2 and last 2 bytes of the
  184. # SHA1 has of the message id as the basis of the directory name.
  185. # Clashes here don't really matter too much, and that still gives us a
  186. # 32-bit space to work with.
  187. msgid = msg['message-id']
  188. if msgid is None:
  189. msgid = msg['Message-ID'] = Utils.unique_message_id(mlist)
  190. # We assume that the message id actually /is/ unique!
  191. digest = sha.new(msgid).hexdigest()
  192. msgdir = digest[:4] + digest[-4:]
  193. try:
  194. os.mkdir(os.path.join(dir, msgdir), 02775)
  195. except OSError, e:
  196. if e.errno <> errno.EEXIST: raise
  197. # Figure out the attachment type and get the decoded data
  198. decodedpayload = msg.get_payload(decode=1)
  199. # BAW: mimetypes ought to handle non-standard, but commonly found types,
  200. # e.g. image/jpg (should be image/jpeg). For now we just store such
  201. # things as application/octet-streams since that seems the safest.
  202. ext = mimetypes.guess_extension(msg.get_type())
  203. if not ext:
  204. # We don't know what it is, so assume it's just a shapeless
  205. # application/octet-stream, unless the Content-Type: is
  206. # message/rfc822, in which case we know we'll coerce the type to
  207. # text/plain below.
  208. if msg.get_type() == 'message/rfc822':
  209. ext = '.txt'
  210. else:
  211. ext = '.bin'
  212. path = None
  213. # We need a lock to calculate the next attachment number
  214. lockfile = os.path.join(dir, msgdir, 'attachments.lock')
  215. lock = LockFile.LockFile(lockfile)
  216. lock.lock()
  217. try:
  218. # Now base the filename on what's in the attachment, uniquifying it if
  219. # necessary.
  220. filename = msg.get_filename()
  221. if not filename:
  222. filename = 'attachment' + ext
  223. else:
  224. # Sanitize the filename given in the message headers
  225. parts = pre.split(filename)
  226. filename = parts[-1]
  227. # Allow only alphanumerics, dash, underscore, and dot
  228. filename = sre.sub('', filename)
  229. # If the filename's extension doesn't match the type we guessed,
  230. # which one should we go with? Not sure. Let's do this at least:
  231. # if the filename /has/ no extension, then tack on the one we
  232. # guessed.
  233. if not os.path.splitext(filename)[1]:
  234. filename += ext
  235. # BAW: Anything else we need to be worried about?
  236. counter = 0
  237. extra = ''
  238. while 1:
  239. path = os.path.join(dir, msgdir, filename + extra)
  240. # Generally it is not a good idea to test for file existance
  241. # before just trying to create it, but the alternatives aren't
  242. # wonderful (i.e. os.open(..., O_CREAT | O_EXCL) isn't
  243. # NFS-safe). Besides, we have an exclusive lock now, so we're
  244. # guaranteed that no other process will be racing with us.
  245. if os.path.exists(path):
  246. counter += 1
  247. extra = '-%04d%s' % (counter, ext)
  248. else:
  249. break
  250. finally:
  251. lock.unlock()
  252. # `path' now contains the unique filename for the attachment. There's
  253. # just one more step we need to do. If the part is text/html and
  254. # ARCHIVE_HTML_SANITIZER is a string (which it must be or we wouldn't be
  255. # here), then send the attachment through the filter program for
  256. # sanitization
  257. if filter_html and msg.get_type() == 'text/html':
  258. base, ext = os.path.splitext(path)
  259. tmppath = base + '-tmp' + ext
  260. fp = open(tmppath, 'w')
  261. try:
  262. fp.write(decodedpayload)
  263. fp.close()
  264. cmd = mm_cfg.ARCHIVE_HTML_SANITIZER % {'filename' : tmppath}
  265. progfp = os.popen(cmd, 'r')
  266. decodedpayload = progfp.read()
  267. status = progfp.close()
  268. if status:
  269. syslog('error',
  270. 'HTML sanitizer exited with non-zero status: %s',
  271. status)
  272. finally:
  273. os.unlink(tmppath)
  274. # BAW: Since we've now sanitized the document, it should be plain
  275. # text. Blarg, we really want the sanitizer to tell us what the type
  276. # if the return data is. :(
  277. path = base + '.txt'
  278. filename = os.path.splitext(filename)[0] + '.txt'
  279. # Is it a message/rfc822 attachment?
  280. elif msg.get_type() == 'message/rfc822':
  281. submsg = msg.get_payload()
  282. # BAW: I'm sure we can eventually do better than this. :(
  283. decodedpayload = cgi.escape(str(submsg))
  284. fp = open(path, 'w')
  285. fp.write(decodedpayload)
  286. fp.close()
  287. # Now calculate the url
  288. baseurl = mlist.GetBaseArchiveURL()
  289. # Private archives will likely have a trailing slash. Normalize.
  290. if baseurl[-1] <> '/':
  291. baseurl += '/'
  292. url = baseurl + 'attachments/%s/%s' % (msgdir, filename)
  293. return url