# scrub.py | searchcode
#
# /django_mailman3/lib/scrub.py
#
# https://gitlab.com/thelinuxguy/django-mailman3
# Python | 277 lines | 197 code | 13 blank | 67 comment | 27 complexity | 78437701b90123e979044e69848756ab MD5 | raw file

# -*- coding: utf-8 -*-
#
# Copyright (C) 2016 by the Free Software Foundation, Inc.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA.

"""Cleanse a message for archiving."""

from __future__ import absolute_import, unicode_literals

import os
import re
import binascii

from django.utils.six import integer_types
from mimetypes import guess_all_extensions
from email.header import decode_header, make_header
from email.errors import HeaderParseError

# Path characters for common platforms.  Scrubber.parse_attachment splits the
# attachment filename on these and keeps only the last component, so any
# directory part supplied by the sender is dropped.
pre = re.compile(r'[/\\:]')
# All other characters to strip out of Content-Disposition: filenames
# (essentially anything that isn't an alphanum, dot, dash, or underscore).
# Applied to both the base name and the extension.
# NOTE(review): what ``\w`` matches depends on the interpreter's default
# regex flags for text patterns -- confirm non-ASCII letters are handled as
# intended.
sre = re.compile(r'[^-\w.]')
# Regexp to strip out leading dots (zero or more at the start of the string).
dre = re.compile(r'^\.*')

# HTML line break followed by a newline.
BR = '<br>\n'

# Separator line; Scrubber.scrub() truncates the text of a non-multipart
# message at the first occurrence of this marker.
NEXT_PART = re.compile(r'--------------[ ]next[ ]part[ ]--------------\n')


def guess_extension(ctype, ext):
    """
    Pick a filename extension for the content type ``ctype``.

    ``mimetypes`` maps several extensions to the same type (for instance
    .doc, .dot and .wiz all map to application/msword), which makes the
    reverse lookup ambiguous.  If ``ext`` is one of the extensions known for
    ``ctype`` it is trusted and returned unchanged; otherwise the first known
    extension is returned as a best guess.

    When no extension at all is known for ``ctype`` the (empty, falsy) list
    of candidates is returned, so callers should test the result for
    truthiness.
    """
    candidates = guess_all_extensions(ctype, strict=False)
    if not candidates:
        # Nothing known for this type: hand back the empty result as-is.
        return candidates
    if ext in candidates:
        return ext
    return candidates[0]


def get_charset(message, default="ascii", guess=False):
    """
    Get the message charset as a text string.

    http://ginstrom.com/scribbles/2007/11/19/parsing-multilingual-email-with-python/

    The charset is looked up in this order:

    1. the ``charset`` parameter of the Content-Type header
       (``message.get_content_charset()``);
    2. the ``Charset`` object attached to the message
       (``message.get_charset()``);
    3. if ``guess`` is true, the first of ascii, utf-8 and iso-8859-15 that
       decodes the transfer-decoded payload without error;
    4. ``default``.

    :param message: an ``email.message.Message`` instance.
    :param default: value returned when no charset is declared (and guessing
        is disabled or impossible).  May be ``None``.
    :param guess: whether to try decoding the payload to guess the charset.
    :return: the charset name, or ``default``.
    :raises UnicodeDecodeError: if a declared charset name is a byte string
        containing non-ASCII bytes.
    """
    from email.charset import Charset

    declared = message.get_content_charset() or message.get_charset()
    if declared:
        if isinstance(declared, Charset):
            # get_charset() returns a Charset object, not a string; its
            # string form is the (lower-cased) input charset name.
            declared = str(declared)
        if isinstance(declared, bytes):
            # Byte-string names (Python 2) are converted to text; names that
            # are already text are returned untouched.
            declared = declared.decode("ascii")
        return declared
    if not guess:
        return default
    # Try to guess the encoding (best effort mode)
    payload = message.get_payload(decode=True)
    if not isinstance(payload, bytes):
        # No decodable payload (e.g. None for a multipart or empty message):
        # there is nothing to guess from.
        return default
    for encoding in ("ascii", "utf-8", "iso-8859-15"):
        try:
            payload.decode(encoding)
        except UnicodeDecodeError:
            continue
        return encoding
    return default


def oneline(s):
    """
    Decode an RFC 2047 encoded header value and collapse it to one line.

    Inspired by mailman.utilities.string.oneline.

    :param s: the raw header value (text).
    :return: the decoded value with all line breaks removed.  If decoding
        fails (unknown charset, malformed encoded-word, ...), the undecoded
        ``s`` is returned with its line breaks removed instead.
    """
    try:
        header = make_header(decode_header(s))
        # Header only provides __unicode__ on Python 2; on Python 3 the text
        # form is obtained through str().
        to_text = getattr(header, "__unicode__", None)
        ustr = to_text() if to_text is not None else str(header)
        return ''.join(ustr.splitlines())
    except (LookupError, UnicodeError, ValueError, HeaderParseError):
        # possibly charset problem. return with undecoded string in one line.
        return ''.join(s.splitlines())


class Scrubber(object):
    """
    Scrubs a single message, extracts attachments, and returns the text and
    the attachments.

    http://ginstrom.com/scribbles/2007/11/19/parsing-multilingual-email-with-python/

    Usage: ``text, attachments = Scrubber(msg).scrub()``.

    Scrubbing modifies ``msg``: several kinds of parts that are extracted as
    attachments have their payload replaced by an empty string.
    """

    # The interpreter's text type (``unicode`` on Python 2, ``str`` on
    # Python 3), obtained without naming either builtin.
    _text_type = type(u"")

    def __init__(self, msg):
        self.msg = msg

    @staticmethod
    def _to_text(value, encoding="ascii", errors="strict"):
        """Decode ``value`` if it is a byte string, else return it as-is."""
        if isinstance(value, bytes):
            return value.decode(encoding, errors)
        return value

    def _decode_payload(self, raw, charset):
        """Decode ``raw`` with ``charset``, falling back to ASCII.

        Undecodable characters are replaced in both attempts.
        """
        try:
            return self._to_text(raw, charset, 'replace')
        except (UnicodeError, LookupError, ValueError, AssertionError):
            # We can get here if charset is bogus in some way.
            return self._to_text(raw, 'ascii', 'replace')

    def scrub(self):
        """
        Extract the attachments and build the text of the message.

        :return: a ``(text, attachments)`` tuple.  ``text`` is the decoded
            text of the message; for multipart messages it is the
            concatenation of the remaining text/plain parts.  ``attachments``
            is a list of the tuples returned by :meth:`parse_attachment`.
        """
        attachments = []
        sanitize = 1  # TODO: implement other options
        # Now walk over all subparts of this message and scrub out various
        # types
        for part_num, part in enumerate(self.msg.walk()):
            ctype = self._to_text(part.get_content_type())
            # If the part is text/plain, we leave it alone
            if ctype == 'text/plain':
                disposition = part.get('content-disposition')
                if disposition and self._to_text(
                        disposition, "ascii", "replace"
                        ).strip().startswith("attachment"):
                    # part is attached
                    attachments.append(self.parse_attachment(part, part_num))
                    part.set_payload('')
            elif ctype == 'text/html' and isinstance(sanitize, integer_types):
                if sanitize == 1:
                    # Don't HTML-escape it, this is the frontend's job
                    attachments.append(self.parse_attachment(
                        part, part_num, filter_html=False))
                    part.set_payload('')
            elif ctype == 'message/rfc822':
                # This part contains a submessage.  It is stored as an
                # attachment; blanking the payload also stops walk() from
                # descending into it.
                attachments.append(self.parse_attachment(part, part_num))
                part.set_payload('')
            # If the message isn't a multipart, then we'll strip it out as an
            # attachment that would have to be separately downloaded.
            elif part.get_payload() and not part.is_multipart():
                payload = part.get_payload(decode=True)
                # XXX Under email 2.5, it is possible that payload will be
                # None. This can happen when you have a Content-Type:
                # multipart/* with only one part and that part has two blank
                # lines between the first boundary and the end boundary.  In
                # email 3.0 you end up with a string in the payload.  I think
                # in this case it's safe to ignore the part.
                if payload is None:
                    continue
                attachments.append(self.parse_attachment(part, part_num))
        # We still have to sanitize multipart messages to flat text because
        # Pipermail can't handle messages with list payloads.  This is a
        # kludge; def (n) clever hack ;).
        if self.msg.is_multipart():
            # We now want to concatenate all the parts which have been scrubbed
            # to text/plain, into a single text/plain payload.  We need to make
            # sure all the characters in the concatenated string are in the
            # same encoding, so we'll use the 'replace' key in the coercion
            # call.
            # BAW: Martin's original patch suggested we might want to try
            # generalizing to utf-8, and that's probably a good idea
            # (eventually).
            text = []
            for part in self.msg.walk():
                # TK: bug-id 1099138 and multipart
                # MAS test payload - if part may fail if there are no headers.
                if not part.get_payload() or part.is_multipart():
                    continue
                # All parts should be scrubbed to text/plain by now, except
                # if sanitize == 2, there could be text/html parts so keep them
                # but skip any other parts.
                partctype = part.get_content_type()
                if partctype != 'text/plain' and (partctype != 'text/html' or
                                                  sanitize != 2):
                    # text.append(_('Skipped content of type %(partctype)s\n'))
                    continue
                try:
                    t = part.get_payload(decode=True) or ''
                # MAS: TypeError exception can occur if payload is None. This
                # was observed with a message that contained an attached
                # message/delivery-status part. Because of the special parsing
                # of this type, this resulted in a text/plain sub-part with a
                # null body. See bug 1430236.
                except (binascii.Error, TypeError):
                    t = part.get_payload() or ''
                partcharset = get_charset(part, guess=True)
                t = self._decode_payload(t, partcharset)
                # Separation is useful
                if isinstance(t, self._text_type):
                    if not t.endswith('\n'):
                        t += '\n'
                    text.append(t)

            text = u"\n".join(text)
        else:
            raw = self.msg.get_payload(decode=True)
            if raw is None:
                # A message without any payload has no text to decode.
                raw = b''
            charset = get_charset(self.msg, guess=True)
            text = self._decode_payload(raw, charset)

            # Drop everything from the first "next part" separator onwards.
            next_part_match = NEXT_PART.search(text)
            if next_part_match:
                text = text[0:next_part_match.start(0)]

        return (text, attachments)

    def parse_attachment(self, part, counter, filter_html=True):
        """
        Describe one message part as an attachment.

        :param part: the message part to extract.
        :param counter: the index of the part; returned unchanged as the
            first item of the result.
        :param filter_html: currently unused (the HTML sanitizer has not been
            re-implemented); kept for interface compatibility.
        :return: a ``(counter, filename, content_type, charset, payload)``
            tuple.  ``filename`` is the sanitized name from the part's
            headers (or ``attachment``) plus an extension; ``charset`` is the
            declared charset or ``None``; ``payload`` is the
            transfer-decoded content, or the string form of the sub-message
            for message/rfc822 parts.
        """
        # Store name, content-type and size
        # Figure out the attachment type and get the decoded data
        decodedpayload = part.get_payload(decode=True)
        # BAW: mimetypes ought to handle non-standard, but commonly found
        # types, e.g. image/jpg (should be image/jpeg).  For now we just store
        # such things as application/octet-streams since that seems the safest.
        ctype = self._to_text(part.get_content_type())
        charset = get_charset(part, default=None, guess=False)
        # i18n file name is encoded
        try:
            filename = oneline(part.get_filename(''))
        except (TypeError, UnicodeDecodeError):
            # Workaround for https://bugs.launchpad.net/mailman/+bug/1060951
            # (accented filenames)
            filename = u"attachment.bin"
        filename, fnext = os.path.splitext(filename)
        # For safety, we should confirm this is valid ext for content-type
        # but we can use fnext if we introduce fnext filtering
        # TODO: re-implement this
        # if mm_cfg.SCRUBBER_USE_ATTACHMENT_FILENAME_EXTENSION:
        #     # HTML message doesn't have filename :-(
        #     ext = fnext or guess_extension(ctype, fnext)
        # else:
        #     ext = guess_extension(ctype, fnext)
        ext = fnext or guess_extension(ctype, fnext)
        if not ext:
            # We don't know what it is, so assume it's just a shapeless
            # application/octet-stream, unless the Content-Type: is
            # message/rfc822, in which case we know we'll coerce the type to
            # text/plain below.
            if ctype == 'message/rfc822':
                ext = '.txt'
            else:
                ext = '.bin'
        # Allow only alphanumerics, dash, underscore, and dot
        ext = sre.sub('', ext)
        # Now base the filename on what's in the attachment.
        if not filename:
            filebase = u'attachment'
        else:
            # Sanitize the filename given in the message headers: keep only
            # the last path component.
            parts = pre.split(filename)
            filename = parts[-1]
            # Strip off leading dots
            filename = dre.sub('', filename)
            # Allow only alphanumerics, dash, underscore, and dot
            # i18n filenames are not supported yet,
            # see https://bugs.launchpad.net/bugs/1060951
            filename = sre.sub('', filename)
            # The extension was removed from the name above and is appended
            # again when the result is built.
            filebase = filename
        # TODO: bring back the HTML sanitizer feature
        if ctype == 'message/rfc822':
            submsg = part.get_payload()
            # Don't HTML-escape it, this is the frontend's job
            if isinstance(submsg, list):
                # A parsed message/rfc822 part holds its sub-message(s) in a
                # list; stringify each message rather than the list itself,
                # which would only give the list's repr.  ``str()`` is the
                # empty native string, so the result has the same type as
                # ``str(message)``.
                decodedpayload = str().join(str(sub) for sub in submsg)
            else:
                decodedpayload = str(submsg)
        return (counter, filebase + ext, ctype, charset, decodedpayload)