scrub.py | searchcode

/django_mailman3/lib/scrub.py

https://gitlab.com/mailman/django-mailman3
Python | 284 lines | 182 code | 12 blank | 90 comment | 34 complexity | 43a9419e8f22c0b9955a8e0403b26f1a MD5 | raw file

# -*- coding: utf-8 -*-
# Copyright (C) 2017-2022 by the Free Software Foundation, Inc.
#
# This file is part of Django-Mailman.
#
# Django-Mailman3 is a free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# Django-Mailman3 is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
# more details.
#
# You should have received a copy of the GNU General Public License along with
# Django-Mailman.  If not, see <http://www.gnu.org/licenses/>.

import os
import re
from email.errors import HeaderParseError
from email.header import decode_header, make_header
from email.message import EmailMessage
from enum import Enum
from mimetypes import guess_all_extensions


# Path characters for common platforms
PRE = re.compile(r'[/\\:]')
# All other characters to strip out of Content-Disposition: filenames
# (essentially anything that isn't an alphanum, dot, dash, or underscore).
SRE = re.compile(r'[^-\w.]')
# Regexp to strip out leading dots
DRE = re.compile(r'^\.*')

NEXT_PART = re.compile(r'--------------[ ]next[ ]part[ ]--------------\n')


class Sanitize(Enum):
    """
    Enum to denote whether the HTML message should be scrubbed.
    """
    SANITIZE_HTML = 1


def oneline(header_string):
    """Inspired by mailman.utilities.string.oneline"""
    try:
        h = make_header(decode_header(header_string))
        ustr = str(h)
        return ''.join(ustr.splitlines())
    except (LookupError, UnicodeError, ValueError, HeaderParseError):
        # possibly charset problem. return with undecoded string in one line.
        return ''.join(header_string.splitlines())


class Scrubber():
    """
    Given an EmailMessage, extract all the attachments including text/html
    parts and return the text.
    """

    sanitize = Sanitize.SANITIZE_HTML

    def __init__(self, msg):
        assert isinstance(msg, EmailMessage)
        self.msg = msg

    def scrub(self):
        """Given a EmailMessage, extracts the text from the body and all the
        attachments.

        Returns a tuple (result, attachments), in which attachments is a list
        of all the attachments and result is unicode text of the message body.

        """
        attachments = self._get_all_attachments()
        text = self._get_text()
        return (text, attachments)

    def _get_all_attachments(self):
        attachments = []
        # We iterate over all the attachments using the new iter_attachments
        # API in EmailMessage. This returns all immediate children parts that
        # are not candidate body parts.
        for part_num, part in enumerate(self.msg.walk()):
            ctype = part.get_content_type()
            # Messages will *always* return a value for get_content_type, even
            # if message doesn't have one. If there is no content_type defined,
            # text/plain is returned for most message. In case of
            # multipart/digest, it is message/rfc822.
            if ctype == 'text/plain':
                if part.is_attachment():
                    attachments.append(self._parse_attachment(part, part_num))
                    part.set_content('\n')
            elif (ctype == 'text/html' and self.sanitize ==
                  Sanitize.SANITIZE_HTML):
                attachments.append(self._parse_attachment(part, part_num))
                part.set_content('\n')
            elif ctype == 'message/rfc822':
                attachments.append(self._parse_attachment(part, part_num))
                part.set_content('\n')
            elif part.get_payload() and not part.is_multipart():
                attachments.append(self._parse_attachment(part, part_num))
        return attachments

    def _get_charset(self, msg, default='ascii', guess='False'):
        """
        Returns the charset of a EmailMessage part.

        If there is no charset defined, try to guess by decoding with certain
        common types.

        :param msg: The EmailMessage message to return charset for.
        :type msg: EmailMessage
        :param default: The charset to be assumed as default if none is defined
        :type default: str
        :param guess: Boolean defining whether we should try to guess the
                      charset.
        :type guess: Bool
        """
        if msg.get_content_charset():
            return msg.get_content_charset()
        if msg.get_charset():
            return msg.get_charset()
        charset = default
        if not guess:
            # Do not try to guess the charset and just return the default.
            return charset
        text = msg.get_payload(decode=True)
        for encoding in ['ascii', 'utf8', 'iso8859-15']:
            try:
                text.decode(encoding)
            except UnicodeDecodeError:
                continue
            else:
                charset = encoding
                break
        return charset

    def _parse_attachment(self, part, part_num, filter_html=True):
        """
        Decode the attachment.

        :param part: Attachment to be parsed.
        :type part: EmailMessage
        :param part_num: An attachment numerical identifier
        :type part_num: int
        :filter_html: Whether filter HTML content from the text of attachment.
        :type filter_html: Bool
        """
        ctype = part.get_content_type()
        charset = self._get_charset(part, default=None, guess=False)
        try:
            payload = part.get_content()
        except LookupError as e:
            payload = "Can't retrieve content: {}".format(e)
        # get_content will raise KeyError if called on a multipart part.  We
        # never call _parse_attachment() on multipart parts, so that's OK.
        # We have seen LookupError if the part's charset is unknown, so catch
        # that and just return a message.
        # XXX We could try some known charsets, but for now we just punt.
        #
        # get_content will return a string for text/* parts, an
        # EmailMessage object for message/rfc822 parts and bytes for other
        # content types.  text/* parts will be CTE decoded and decoded per
        # their declared charset.  Other parts will be CTE decoded.
        if ctype == 'message/rfc822':
            # Return message/rfc822 parts as a string.
            decodedpayload = str(payload)
        else:
            # It is a str or bytes, just return it as it is.
            decodedpayload = payload
        filename = self._get_attachment_filename(part, ctype)
        return (part_num, filename, ctype, charset, decodedpayload)

    def _guess_all_extensions(self, ctype):
        """
        Given the attachment's content-type, try to guess its file extension.
        """
        # mimetypes maps multiple extensions to the same type, e.g. .doc, .dot,
        # and .wiz are all mapped to application/msword.  This sucks for
        # finding the best reverse mapping.  If the extension is one of the
        # giving mappings, we'll trust that, otherwise we'll just guess. :/
        all_exts = guess_all_extensions(ctype, strict=False)
        return all_exts and all_exts[0]

    def _get_attachment_filename(self, part, ctype):
        # Try to get the filename using the default `get_filename()`
        # API.
        try:
            filename = oneline(part.get_filename(''))
        except (TypeError, UnicodeDecodeError):
            # Workaround for https://bugs.launchpad.net/mailman/+bug/1060951
            # (accented filenames).
            # In Python3 get_filename decodes the filename with
            # `errors=replace` which means, that if there are non-ascii
            # characters in the filename, they are replaced with '?'.
            filename = 'attachment.bin'

        filename, fext = os.path.splitext(filename)
        ext = fext or self._guess_all_extensions(ctype)
        # Now that we have a guessed extension and if it returned no values,
        # let's cook up some extensions depending on the content type.
        if not ext:
            if ctype == 'message/rfc822':
                ext = '.txt'
            else:
                ext = '.bin'
        # Remove anything other than alphanum, dot, dash or underscore.
        ext = SRE.sub('', ext)
        if not filename:
            # Use attachment as default filename if there is none.
            filebase = 'attachment'
        else:
            # Sanitize the filename given in the message headers.
            parts = PRE.split(filename)
            filename = parts[-1]
            # Strip off the leading dots.
            filename = DRE.sub('', filename)
            # Allow only alphanumerics, dash, underscore, and dot
            # i18n filenames are not supported yet,
            # see https://bugs.launchpad.net/bugs/1060951
            filename = SRE.sub('', filename)
            # If the filename's extension doesn't match the type we guessed,
            # which one should we go with?  For now, let's go with the one we
            # guessed so attachments can't lie about their type.  Also, if the
            # filename /has/ no extension, then tack on the one we guessed.
            # The extension was removed from the name above.
            filebase = filename
        return filebase + ext

    def _get_text_one_part(self, msg):
        """
        Returns decoded payload for a non-multipart message.
        """
        # MAS: TypeError exception can occur if payload is None. This
        # was observed with a message that contained an attached
        # message/delivery-status part. Because of the special parsing
        # of this type, this resulted in a text/plain sub-part with a
        # null body. See bug 1430236.
        charset = self._get_charset(msg, guess=True)
        payload = msg.get_payload(decode=True)
        try:
            result = payload.decode(charset)
        except (UnicodeDecodeError, LookupError, ValueError, AssertionError):
            result = payload.decode('utf-8', 'replace')
        next_part_match = NEXT_PART.search(result)
        if next_part_match:
            result = result[0:next_part_match.start(0)]
        # MAS Remove any null butes from the result.
        result = re.sub('\x00', '', result)
        return result

    def _get_text(self):
        if self.msg.is_multipart():
            # We now want to concatenate all the parts which have been scrubbed
            # to text/plain, into a single text/plain payload.  We need to make
            # sure all the characters in the concatenated string are in the
            # same encoding, so we'll use the 'replace' key in the coercion
            # call.
            # BAW: Martin's original patch suggested we might want to try
            # generalizing to utf-8, and that's probably a good idea
            # (eventually).
            text = []
            for part in self.msg.walk():
                # Walk through the message and collect all the plaintext parts
                # and leave all the multiparts.
                if part.is_multipart():
                    continue
                ctype = part.get_content_type()
                # Ignore anything other text/plain and text/html
                if ctype != 'text/plain' and (
                        ctype != 'text/html' or self.sanitize != 2):
                    continue
                part_content = self._get_text_one_part(part)
                if isinstance(part_content, str):
                    if not part_content.endswith('\n'):
                        part_content += '\n'
                text.append(part_content)
            # MAS remove any null bytes from the text.
            return re.sub('\x00', '', '\n'.join(text))
        else:
            return self._get_text_one_part(self.msg)