/hyperkitty/lib/scrub.py
Python | 311 lines | 224 code | 12 blank | 75 comment | 28 complexity | 0fea7194a6b8dee8fb1b3ab7c75d87a2 MD5 | raw file
- # Copyright (C) 2011-2012 by the Free Software Foundation, Inc.
- #
- # This program is free software; you can redistribute it and/or
- # modify it under the terms of the GNU General Public License
- # as published by the Free Software Foundation; either version 2
- # of the License, or (at your option) any later version.
- #
- # This program is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU General Public License for more details.
- #
- # You should have received a copy of the GNU General Public License
- # along with this program; if not, write to the Free Software
- # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
- # USA.
- """Cleanse a message for archiving."""
- from __future__ import absolute_import, unicode_literals
- import os
- import re
- import binascii
- from types import IntType
- from mimetypes import guess_all_extensions
- from email.header import decode_header, make_header
- from email.errors import HeaderParseError
# Characters that separate path components on common platforms: slash,
# backslash and colon.
pre = re.compile(r'[/\\:]')

# Every character that is not allowed in a stored attachment name, i.e.
# anything other than a word character (alphanumeric or underscore), a dash
# or a dot.  Used to clean up Content-Disposition: filenames.
sre = re.compile(r'[^-\w.]')

# The (possibly empty) run of dots at the very beginning of a name.
dre = re.compile(r'^\.*')

# HTML line break followed by a newline.
BR = '<br>\n'

# The "-------------- next part --------------" separator line, including its
# trailing newline.  Scrubber.scrub() cuts the text at the first occurrence.
NEXT_PART = re.compile(r'--------------[ ]next[ ]part[ ]--------------\n')
def guess_extension(ctype, ext):
    """
    Pick a file extension for the content type ``ctype``.

    mimetypes maps several extensions to the same type (e.g. .doc, .dot and
    .wiz all map to application/msword), which makes the reverse lookup
    ambiguous.  If ``ext`` is one of the extensions known for ``ctype`` it
    is trusted and returned; otherwise the first known extension is used.

    :param ctype: the content type to look up.
    :param ext: the extension suggested by the attachment, if any.
    :return: an extension, or the empty list produced by the lookup when
        nothing is known for ``ctype``.
    """
    candidates = guess_all_extensions(ctype, strict=False)
    if not candidates:
        # Nothing is known for this type: hand the (falsy) empty list back.
        return candidates
    if ext in candidates:
        return ext
    return candidates[0]
def get_charset(message, default="ascii", guess=False):
    """
    Get the message charset.

    The charset declared in the Content-Type header is preferred; failing
    that, the charset attached to the message payload is used.  When neither
    is available ``default`` is returned, unless ``guess`` is true, in which
    case the decoded payload is tried against a few common encodings.

    From: http://ginstrom.com/scribbles/2007/11/19/parsing-multilingual-email-with-python/

    :param message: the message (or message part) to inspect.
    :param default: the value returned when no charset can be determined.
    :param guess: whether to probe the payload when no charset is declared.
    :return: the charset name as text, or ``default``.
    """
    declared = message.get_content_charset()
    if not declared:
        # Message.get_charset() returns a Charset instance, not a string;
        # its string form is the charset name.
        charset_obj = message.get_charset()
        if charset_obj:
            declared = str(charset_obj)
    if declared:
        if isinstance(declared, bytes):
            declared = declared.decode("ascii")
        return declared
    if not guess:
        return default
    # Try to guess the encoding (best effort mode)
    payload = message.get_payload(decode=True)
    if not isinstance(payload, bytes):
        # Nothing to probe: get_payload(decode=True) returns None for
        # multipart containers and for messages without a payload.
        return default
    for encoding in ["ascii", "utf-8", "iso-8859-15"]:
        try:
            payload.decode(encoding)
        except UnicodeDecodeError:
            continue
        return encoding
    return default
def oneline(s):
    """
    Decode the header value ``s`` and return it with all line breaks removed.

    Inspired by mailman.utilities.string.oneline.  When the value cannot be
    decoded (possibly a charset problem), the undecoded string is folded
    onto one line instead.
    """
    try:
        flattened = make_header(decode_header(s)).__unicode__()
    except (LookupError, UnicodeError, ValueError, HeaderParseError):
        # Fall back to the raw value; it still has to end up on one line.
        flattened = s
    return ''.join(flattened.splitlines())
class Scrubber(object):
    """
    Scrubs a single message, extracts the attachments, and returns the text
    and the attachments.

    See also: http://ginstrom.com/scribbles/2007/11/19/parsing-multilingual-email-with-python/
    """

    def __init__(self, mlist, msg):
        """
        :param mlist: stored on the instance as-is; the methods of this class
            do not read it (presumably the mailing list -- confirm with the
            callers).
        :param msg: the message to scrub.  scrub() modifies it in place.
        """
        self.mlist = mlist
        self.msg = msg

    def scrub(self):
        """
        Extract the attachments from the message and flatten the rest to text.

        The payload of every text/plain attachment, text/html part and
        message/rfc822 part is replaced by an empty string once the part has
        been extracted, so ``self.msg`` is modified in place.

        :return: a ``(text, attachments)`` tuple, where ``text`` is the
            decoded text of the message and ``attachments`` is the list of
            tuples produced by parse_attachment().
        """
        attachments = []
        sanitize = 1  # TODO: implement other options
        # Now walk over all subparts of this message and scrub out various
        # types.
        for part_num, part in enumerate(self.msg.walk()):
            ctype = part.get_content_type()
            if not isinstance(ctype, unicode):
                ctype = ctype.decode("ascii")
            if ctype == 'text/plain':
                # Inline text/plain parts are left alone, only the ones
                # explicitly flagged as attachments are extracted.
                disposition = part.get('content-disposition')
                if disposition and disposition.decode(
                        "ascii", "replace").strip().startswith("attachment"):
                    attachments.append(self.parse_attachment(part, part_num))
                    part.set_payload('')
            elif ctype == 'text/html' and isinstance(sanitize, IntType):
                # Only sanitize == 1 is implemented: the HTML part is stored
                # as an attachment without being escaped, because escaping
                # is the frontend's job.  The other modes of the original
                # scrubber are TODO:
                #   0: discard the message or replace the part by a notice,
                #   2: leave the HTML part in the message,
                #   3: extract the part as an unescaped attachment and
                #      replace it by a notice containing its URL.
                if sanitize == 1:
                    attachments.append(self.parse_attachment(
                        part, part_num, filter_html=False))
                    part.set_payload('')
            elif ctype == 'message/rfc822':
                # This part contains a submessage, so it too needs scrubbing.
                # Emptying the payload also keeps walk() from descending
                # into the submessage afterwards.
                attachments.append(self.parse_attachment(part, part_num))
                part.set_payload('')
            # If the message isn't a multipart, then we'll strip it out as an
            # attachment that would have to be separately downloaded.
            elif part.get_payload() and not part.is_multipart():
                payload = part.get_payload(decode=True)
                # XXX Under email 2.5, it is possible that payload will be
                # None.  This can happen when you have a Content-Type:
                # multipart/* with only one part and that part has two blank
                # lines between the first boundary and the end boundary.  In
                # email 3.0 you end up with a string in the payload.  I think
                # in this case it's safe to ignore the part.
                if payload is None:
                    continue
                attachments.append(self.parse_attachment(part, part_num))
        if self.msg.is_multipart():
            # Multipart messages are flattened: all the parts that are still
            # text/plain are decoded and concatenated into a single text.
            # Decoding uses errors='replace' so that parts in different
            # encodings can always be joined.
            text = []
            for part in self.msg.walk():
                # TK: bug-id 1099138 and multipart
                # MAS test payload - if part may fail if there are no headers.
                if not part.get_payload() or part.is_multipart():
                    continue
                # All parts should be scrubbed to text/plain by now, except
                # if sanitize == 2, there could be text/html parts so keep
                # them but skip any other parts.
                partctype = part.get_content_type()
                if partctype != 'text/plain' and (
                        partctype != 'text/html' or sanitize != 2):
                    continue
                try:
                    t = part.get_payload(decode=True) or ''
                # MAS: TypeError exception can occur if payload is None. This
                # was observed with a message that contained an attached
                # message/delivery-status part. Because of the special
                # parsing of this type, this resulted in a text/plain
                # sub-part with a null body. See bug 1430236.
                except (binascii.Error, TypeError):
                    t = part.get_payload() or ''
                partcharset = get_charset(part, guess=True)
                try:
                    t = t.decode(partcharset, 'replace')
                except (UnicodeError, LookupError, ValueError,
                        AssertionError):
                    # We can get here if partcharset is bogus in some way.
                    # Replace funny characters.  We use errors='replace'
                    t = t.decode('ascii', 'replace')
                # Separation is useful
                if isinstance(t, basestring):
                    if not t.endswith('\n'):
                        t += '\n'
                    text.append(t)
            text = u"\n".join(text)
        else:
            text = self.msg.get_payload(decode=True)
            if text is None:
                # A message without any payload has no text to decode.
                text = ''
            else:
                charset = get_charset(self.msg, guess=True)
                try:
                    text = text.decode(charset, "replace")
                except (UnicodeError, LookupError, ValueError,
                        AssertionError):
                    text = text.decode('ascii', 'replace')
            # Drop everything from the first "next part" separator onwards.
            next_part_match = NEXT_PART.search(text)
            if next_part_match:
                text = text[0:next_part_match.start(0)]
        return (text, attachments)

    def parse_attachment(self, part, counter, filter_html=True):
        """
        Build the attachment description of a message part.

        :param part: the message part to turn into an attachment.
        :param counter: the position of the part in the message; returned
            unchanged as the first item of the result.
        :param filter_html: currently unused (the HTML sanitizer feature is
            still TODO).
        :return: a ``(counter, name, content_type, charset, payload)`` tuple.
            ``name`` is a sanitized file name, ``charset`` is the charset
            declared by the part or None, and ``payload`` is the decoded
            payload (for message/rfc822 parts, the flattened submessage).
        """
        # pylint: disable=unused-argument
        # Figure out the attachment type and get the decoded data
        decodedpayload = part.get_payload(decode=True)
        # BAW: mimetypes ought to handle non-standard, but commonly found
        # types, e.g. image/jpg (should be image/jpeg).  For now we just
        # store such things as application/octet-streams since that seems
        # the safest.
        ctype = part.get_content_type()
        if not isinstance(ctype, unicode):
            ctype = ctype.decode("ascii")
        charset = get_charset(part, default=None, guess=False)
        # i18n file name is encoded
        try:
            filename = oneline(part.get_filename(''))
        except (TypeError, UnicodeDecodeError):
            # Workaround for https://bugs.launchpad.net/mailman/+bug/1060951
            # (accented filenames)
            filename = u"attachment.bin"
        filename, fnext = os.path.splitext(filename)
        # For safety, we should confirm this is a valid extension for the
        # content type.  TODO: re-implement the setting that chooses between
        # trusting the attachment's extension and always guessing it; for
        # now the attachment's own extension wins when there is one.
        ext = fnext or guess_extension(ctype, fnext)
        if not ext:
            # We don't know what it is, so assume it's just a shapeless
            # application/octet-stream, unless the Content-Type: is
            # message/rfc822, in which case the payload stored below is the
            # submessage flattened to text.
            if ctype == 'message/rfc822':
                ext = '.txt'
            else:
                ext = '.bin'
        # Allow only alphanumerics, dash, underscore, and dot
        ext = sre.sub('', ext)
        # Now base the filename on what's in the attachment.
        if not filename:
            filebase = u'attachment'
        else:
            # Sanitize the filename given in the message headers: keep only
            # the last path component...
            filename = pre.split(filename)[-1]
            # ...strip off leading dots...
            filename = dre.sub('', filename)
            # ...and allow only alphanumerics, dash, underscore, and dot.
            # i18n filenames are not supported yet,
            # see https://bugs.launchpad.net/bugs/1060951
            filename = sre.sub('', filename)
            # Sanitizing may have removed every character (e.g. a name made
            # only of dots or of non-ASCII characters); fall back to the
            # generic name rather than producing a bare extension.
            filebase = filename or u'attachment'
        # TODO: bring back the HTML sanitizer feature
        if ctype == 'message/rfc822':
            submsg = part.get_payload()
            # Don't HTML-escape it, this is the frontend's job.
            # The payload of a parsed message/rfc822 part is a list holding
            # the embedded message; str() on the list itself would only
            # give its repr(), so flatten the messages it contains instead.
            if isinstance(submsg, list):
                decodedpayload = str().join(str(sub) for sub in submsg)
            else:
                decodedpayload = str(submsg)
        return (counter, filebase + ext, ctype, charset, decodedpayload)