PageRenderTime 38ms CodeModel.GetById 29ms app.highlight 7ms RepoModel.GetById 0ms app.codeStats 0ms

/django_mailman3/lib/scrub.py

https://gitlab.com/mailman/django-mailman3
Python | 284 lines | 182 code | 12 blank | 90 comment | 26 complexity | 43a9419e8f22c0b9955a8e0403b26f1a MD5 | raw file
  1# -*- coding: utf-8 -*-
  2# Copyright (C) 2017-2022 by the Free Software Foundation, Inc.
  3#
  4# This file is part of Django-Mailman.
  5#
  6# Django-Mailman3 is a free software: you can redistribute it and/or modify it
  7# under the terms of the GNU General Public License as published by the Free
  8# Software Foundation, either version 3 of the License, or (at your option) any
  9# later version.
 10#
 11# Django-Mailman3 is distributed in the hope that it will be useful, but
 12# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 13# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 14# more details.
 15#
 16# You should have received a copy of the GNU General Public License along with
 17# Django-Mailman.  If not, see <http://www.gnu.org/licenses/>.
 18
 19import os
 20import re
 21from email.errors import HeaderParseError
 22from email.header import decode_header, make_header
 23from email.message import EmailMessage
 24from enum import Enum
 25from mimetypes import guess_all_extensions
 26
 27
# Path characters for common platforms
# (forward slash, backslash and colon).  Used to split a filename so that only
# its last path component is kept.
PRE = re.compile(r'[/\\:]')
# All other characters to strip out of Content-Disposition: filenames
# (essentially anything that isn't an alphanum, dot, dash, or underscore).
SRE = re.compile(r'[^-\w.]')
# Regexp to strip out leading dots
DRE = re.compile(r'^\.*')

# Separator line; the text of a part is truncated at its first occurrence.
# NOTE(review): presumably the marker Mailman's scrubber inserts ahead of
# scrubbed attachments -- confirm.
NEXT_PART = re.compile(r'--------------[ ]next[ ]part[ ]--------------\n')
 37
 38
class Sanitize(Enum):
    """
    Enum to denote whether the HTML message should be scrubbed.

    ``SANITIZE_HTML`` is the value ``Scrubber.sanitize`` is set to; with it,
    ``Scrubber._get_all_attachments`` extracts text/html parts as attachments
    and replaces their content with a newline.
    """
    SANITIZE_HTML = 1
 44
 45
 46def oneline(header_string):
 47    """Inspired by mailman.utilities.string.oneline"""
 48    try:
 49        h = make_header(decode_header(header_string))
 50        ustr = str(h)
 51        return ''.join(ustr.splitlines())
 52    except (LookupError, UnicodeError, ValueError, HeaderParseError):
 53        # possibly charset problem. return with undecoded string in one line.
 54        return ''.join(header_string.splitlines())
 55
 56
 57class Scrubber():
 58    """
 59    Given an EmailMessage, extract all the attachments including text/html
 60    parts and return the text.
 61    """
 62
 63    sanitize = Sanitize.SANITIZE_HTML
 64
    def __init__(self, msg):
        """
        Store the message that is going to be scrubbed.

        :param msg: The message to extract the text and attachments from.
        :type msg: EmailMessage
        :raises AssertionError: If msg is not an EmailMessage.
        """
        # The other methods use EmailMessage-only API such as get_content()
        # and is_attachment(), hence the type check.
        # NOTE(review): asserts are skipped when Python runs with -O.
        assert isinstance(msg, EmailMessage)
        self.msg = msg
 68
 69    def scrub(self):
 70        """Given a EmailMessage, extracts the text from the body and all the
 71        attachments.
 72
 73        Returns a tuple (result, attachments), in which attachments is a list
 74        of all the attachments and result is unicode text of the message body.
 75
 76        """
 77        attachments = self._get_all_attachments()
 78        text = self._get_text()
 79        return (text, attachments)
 80
 81    def _get_all_attachments(self):
 82        attachments = []
 83        # We iterate over all the attachments using the new iter_attachments
 84        # API in EmailMessage. This returns all immediate children parts that
 85        # are not candidate body parts.
 86        for part_num, part in enumerate(self.msg.walk()):
 87            ctype = part.get_content_type()
 88            # Messages will *always* return a value for get_content_type, even
 89            # if message doesn't have one. If there is no content_type defined,
 90            # text/plain is returned for most message. In case of
 91            # multipart/digest, it is message/rfc822.
 92            if ctype == 'text/plain':
 93                if part.is_attachment():
 94                    attachments.append(self._parse_attachment(part, part_num))
 95                    part.set_content('\n')
 96            elif (ctype == 'text/html' and self.sanitize ==
 97                  Sanitize.SANITIZE_HTML):
 98                attachments.append(self._parse_attachment(part, part_num))
 99                part.set_content('\n')
100            elif ctype == 'message/rfc822':
101                attachments.append(self._parse_attachment(part, part_num))
102                part.set_content('\n')
103            elif part.get_payload() and not part.is_multipart():
104                attachments.append(self._parse_attachment(part, part_num))
105        return attachments
106
107    def _get_charset(self, msg, default='ascii', guess='False'):
108        """
109        Returns the charset of a EmailMessage part.
110
111        If there is no charset defined, try to guess by decoding with certain
112        common types.
113
114        :param msg: The EmailMessage message to return charset for.
115        :type msg: EmailMessage
116        :param default: The charset to be assumed as default if none is defined
117        :type default: str
118        :param guess: Boolean defining whether we should try to guess the
119                      charset.
120        :type guess: Bool
121        """
122        if msg.get_content_charset():
123            return msg.get_content_charset()
124        if msg.get_charset():
125            return msg.get_charset()
126        charset = default
127        if not guess:
128            # Do not try to guess the charset and just return the default.
129            return charset
130        text = msg.get_payload(decode=True)
131        for encoding in ['ascii', 'utf8', 'iso8859-15']:
132            try:
133                text.decode(encoding)
134            except UnicodeDecodeError:
135                continue
136            else:
137                charset = encoding
138                break
139        return charset
140
141    def _parse_attachment(self, part, part_num, filter_html=True):
142        """
143        Decode the attachment.
144
145        :param part: Attachment to be parsed.
146        :type part: EmailMessage
147        :param part_num: An attachment numerical identifier
148        :type part_num: int
149        :filter_html: Whether filter HTML content from the text of attachment.
150        :type filter_html: Bool
151        """
152        ctype = part.get_content_type()
153        charset = self._get_charset(part, default=None, guess=False)
154        try:
155            payload = part.get_content()
156        except LookupError as e:
157            payload = "Can't retrieve content: {}".format(e)
158        # get_content will raise KeyError if called on a multipart part.  We
159        # never call _parse_attachment() on multipart parts, so that's OK.
160        # We have seen LookupError if the part's charset is unknown, so catch
161        # that and just return a message.
162        # XXX We could try some known charsets, but for now we just punt.
163        #
164        # get_content will return a string for text/* parts, an
165        # EmailMessage object for message/rfc822 parts and bytes for other
166        # content types.  text/* parts will be CTE decoded and decoded per
167        # their declared charset.  Other parts will be CTE decoded.
168        if ctype == 'message/rfc822':
169            # Return message/rfc822 parts as a string.
170            decodedpayload = str(payload)
171        else:
172            # It is a str or bytes, just return it as it is.
173            decodedpayload = payload
174        filename = self._get_attachment_filename(part, ctype)
175        return (part_num, filename, ctype, charset, decodedpayload)
176
177    def _guess_all_extensions(self, ctype):
178        """
179        Given the attachment's content-type, try to guess its file extension.
180        """
181        # mimetypes maps multiple extensions to the same type, e.g. .doc, .dot,
182        # and .wiz are all mapped to application/msword.  This sucks for
183        # finding the best reverse mapping.  If the extension is one of the
184        # giving mappings, we'll trust that, otherwise we'll just guess. :/
185        all_exts = guess_all_extensions(ctype, strict=False)
186        return all_exts and all_exts[0]
187
188    def _get_attachment_filename(self, part, ctype):
189        # Try to get the filename using the default `get_filename()`
190        # API.
191        try:
192            filename = oneline(part.get_filename(''))
193        except (TypeError, UnicodeDecodeError):
194            # Workaround for https://bugs.launchpad.net/mailman/+bug/1060951
195            # (accented filenames).
196            # In Python3 get_filename decodes the filename with
197            # `errors=replace` which means, that if there are non-ascii
198            # characters in the filename, they are replaced with '?'.
199            filename = 'attachment.bin'
200
201        filename, fext = os.path.splitext(filename)
202        ext = fext or self._guess_all_extensions(ctype)
203        # Now that we have a guessed extension and if it returned no values,
204        # let's cook up some extensions depending on the content type.
205        if not ext:
206            if ctype == 'message/rfc822':
207                ext = '.txt'
208            else:
209                ext = '.bin'
210        # Remove anything other than alphanum, dot, dash or underscore.
211        ext = SRE.sub('', ext)
212        if not filename:
213            # Use attachment as default filename if there is none.
214            filebase = 'attachment'
215        else:
216            # Sanitize the filename given in the message headers.
217            parts = PRE.split(filename)
218            filename = parts[-1]
219            # Strip off the leading dots.
220            filename = DRE.sub('', filename)
221            # Allow only alphanumerics, dash, underscore, and dot
222            # i18n filenames are not supported yet,
223            # see https://bugs.launchpad.net/bugs/1060951
224            filename = SRE.sub('', filename)
225            # If the filename's extension doesn't match the type we guessed,
226            # which one should we go with?  For now, let's go with the one we
227            # guessed so attachments can't lie about their type.  Also, if the
228            # filename /has/ no extension, then tack on the one we guessed.
229            # The extension was removed from the name above.
230            filebase = filename
231        return filebase + ext
232
233    def _get_text_one_part(self, msg):
234        """
235        Returns decoded payload for a non-multipart message.
236        """
237        # MAS: TypeError exception can occur if payload is None. This
238        # was observed with a message that contained an attached
239        # message/delivery-status part. Because of the special parsing
240        # of this type, this resulted in a text/plain sub-part with a
241        # null body. See bug 1430236.
242        charset = self._get_charset(msg, guess=True)
243        payload = msg.get_payload(decode=True)
244        try:
245            result = payload.decode(charset)
246        except (UnicodeDecodeError, LookupError, ValueError, AssertionError):
247            result = payload.decode('utf-8', 'replace')
248        next_part_match = NEXT_PART.search(result)
249        if next_part_match:
250            result = result[0:next_part_match.start(0)]
251        # MAS Remove any null butes from the result.
252        result = re.sub('\x00', '', result)
253        return result
254
255    def _get_text(self):
256        if self.msg.is_multipart():
257            # We now want to concatenate all the parts which have been scrubbed
258            # to text/plain, into a single text/plain payload.  We need to make
259            # sure all the characters in the concatenated string are in the
260            # same encoding, so we'll use the 'replace' key in the coercion
261            # call.
262            # BAW: Martin's original patch suggested we might want to try
263            # generalizing to utf-8, and that's probably a good idea
264            # (eventually).
265            text = []
266            for part in self.msg.walk():
267                # Walk through the message and collect all the plaintext parts
268                # and leave all the multiparts.
269                if part.is_multipart():
270                    continue
271                ctype = part.get_content_type()
272                # Ignore anything other text/plain and text/html
273                if ctype != 'text/plain' and (
274                        ctype != 'text/html' or self.sanitize != 2):
275                    continue
276                part_content = self._get_text_one_part(part)
277                if isinstance(part_content, str):
278                    if not part_content.endswith('\n'):
279                        part_content += '\n'
280                text.append(part_content)
281            # MAS remove any null bytes from the text.
282            return re.sub('\x00', '', '\n'.join(text))
283        else:
284            return self._get_text_one_part(self.msg)