PageRenderTime 976ms CodeModel.GetById 15ms RepoModel.GetById 0ms app.codeStats 0ms

/django_mailman3/lib/scrub.py

https://gitlab.com/mailman/django-mailman3
Python | 284 lines | 182 code | 12 blank | 90 comment | 34 complexity | 43a9419e8f22c0b9955a8e0403b26f1a MD5 | raw file
  1. # -*- coding: utf-8 -*-
  2. # Copyright (C) 2017-2022 by the Free Software Foundation, Inc.
  3. #
  4. # This file is part of Django-Mailman.
  5. #
  6. # Django-Mailman3 is a free software: you can redistribute it and/or modify it
  7. # under the terms of the GNU General Public License as published by the Free
  8. # Software Foundation, either version 3 of the License, or (at your option) any
  9. # later version.
  10. #
  11. # Django-Mailman3 is distributed in the hope that it will be useful, but
  12. # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13. # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
  14. # more details.
  15. #
  16. # You should have received a copy of the GNU General Public License along with
  17. # Django-Mailman. If not, see <http://www.gnu.org/licenses/>.
  18. import os
  19. import re
  20. from email.errors import HeaderParseError
  21. from email.header import decode_header, make_header
  22. from email.message import EmailMessage
  23. from enum import Enum
  24. from mimetypes import guess_all_extensions
# Path separator characters for common platforms (slash, backslash, colon).
# Used to split a filename taken from the message headers into path
# components.
PRE = re.compile(r'[/\\:]')
# All other characters to strip out of Content-Disposition: filenames
# (essentially anything that isn't an alphanum, dot, dash, or underscore).
SRE = re.compile(r'[^-\w.]')
# Regexp to strip out leading dots (matches zero or more dots anchored at the
# start of the string).
DRE = re.compile(r'^\.*')
# Matches the literal separator line
# "-------------- next part --------------" followed by a newline.
# Scrubber._get_text_one_part() truncates the decoded text at the first match.
NEXT_PART = re.compile(r'--------------[ ]next[ ]part[ ]--------------\n')
  33. class Sanitize(Enum):
  34. """
  35. Enum to denote whether the HTML message should be scrubbed.
  36. """
  37. SANITIZE_HTML = 1
  38. def oneline(header_string):
  39. """Inspired by mailman.utilities.string.oneline"""
  40. try:
  41. h = make_header(decode_header(header_string))
  42. ustr = str(h)
  43. return ''.join(ustr.splitlines())
  44. except (LookupError, UnicodeError, ValueError, HeaderParseError):
  45. # possibly charset problem. return with undecoded string in one line.
  46. return ''.join(header_string.splitlines())
  47. class Scrubber():
  48. """
  49. Given an EmailMessage, extract all the attachments including text/html
  50. parts and return the text.
  51. """
  52. sanitize = Sanitize.SANITIZE_HTML
  53. def __init__(self, msg):
  54. assert isinstance(msg, EmailMessage)
  55. self.msg = msg
  56. def scrub(self):
  57. """Given a EmailMessage, extracts the text from the body and all the
  58. attachments.
  59. Returns a tuple (result, attachments), in which attachments is a list
  60. of all the attachments and result is unicode text of the message body.
  61. """
  62. attachments = self._get_all_attachments()
  63. text = self._get_text()
  64. return (text, attachments)
  65. def _get_all_attachments(self):
  66. attachments = []
  67. # We iterate over all the attachments using the new iter_attachments
  68. # API in EmailMessage. This returns all immediate children parts that
  69. # are not candidate body parts.
  70. for part_num, part in enumerate(self.msg.walk()):
  71. ctype = part.get_content_type()
  72. # Messages will *always* return a value for get_content_type, even
  73. # if message doesn't have one. If there is no content_type defined,
  74. # text/plain is returned for most message. In case of
  75. # multipart/digest, it is message/rfc822.
  76. if ctype == 'text/plain':
  77. if part.is_attachment():
  78. attachments.append(self._parse_attachment(part, part_num))
  79. part.set_content('\n')
  80. elif (ctype == 'text/html' and self.sanitize ==
  81. Sanitize.SANITIZE_HTML):
  82. attachments.append(self._parse_attachment(part, part_num))
  83. part.set_content('\n')
  84. elif ctype == 'message/rfc822':
  85. attachments.append(self._parse_attachment(part, part_num))
  86. part.set_content('\n')
  87. elif part.get_payload() and not part.is_multipart():
  88. attachments.append(self._parse_attachment(part, part_num))
  89. return attachments
  90. def _get_charset(self, msg, default='ascii', guess='False'):
  91. """
  92. Returns the charset of a EmailMessage part.
  93. If there is no charset defined, try to guess by decoding with certain
  94. common types.
  95. :param msg: The EmailMessage message to return charset for.
  96. :type msg: EmailMessage
  97. :param default: The charset to be assumed as default if none is defined
  98. :type default: str
  99. :param guess: Boolean defining whether we should try to guess the
  100. charset.
  101. :type guess: Bool
  102. """
  103. if msg.get_content_charset():
  104. return msg.get_content_charset()
  105. if msg.get_charset():
  106. return msg.get_charset()
  107. charset = default
  108. if not guess:
  109. # Do not try to guess the charset and just return the default.
  110. return charset
  111. text = msg.get_payload(decode=True)
  112. for encoding in ['ascii', 'utf8', 'iso8859-15']:
  113. try:
  114. text.decode(encoding)
  115. except UnicodeDecodeError:
  116. continue
  117. else:
  118. charset = encoding
  119. break
  120. return charset
  121. def _parse_attachment(self, part, part_num, filter_html=True):
  122. """
  123. Decode the attachment.
  124. :param part: Attachment to be parsed.
  125. :type part: EmailMessage
  126. :param part_num: An attachment numerical identifier
  127. :type part_num: int
  128. :filter_html: Whether filter HTML content from the text of attachment.
  129. :type filter_html: Bool
  130. """
  131. ctype = part.get_content_type()
  132. charset = self._get_charset(part, default=None, guess=False)
  133. try:
  134. payload = part.get_content()
  135. except LookupError as e:
  136. payload = "Can't retrieve content: {}".format(e)
  137. # get_content will raise KeyError if called on a multipart part. We
  138. # never call _parse_attachment() on multipart parts, so that's OK.
  139. # We have seen LookupError if the part's charset is unknown, so catch
  140. # that and just return a message.
  141. # XXX We could try some known charsets, but for now we just punt.
  142. #
  143. # get_content will return a string for text/* parts, an
  144. # EmailMessage object for message/rfc822 parts and bytes for other
  145. # content types. text/* parts will be CTE decoded and decoded per
  146. # their declared charset. Other parts will be CTE decoded.
  147. if ctype == 'message/rfc822':
  148. # Return message/rfc822 parts as a string.
  149. decodedpayload = str(payload)
  150. else:
  151. # It is a str or bytes, just return it as it is.
  152. decodedpayload = payload
  153. filename = self._get_attachment_filename(part, ctype)
  154. return (part_num, filename, ctype, charset, decodedpayload)
  155. def _guess_all_extensions(self, ctype):
  156. """
  157. Given the attachment's content-type, try to guess its file extension.
  158. """
  159. # mimetypes maps multiple extensions to the same type, e.g. .doc, .dot,
  160. # and .wiz are all mapped to application/msword. This sucks for
  161. # finding the best reverse mapping. If the extension is one of the
  162. # giving mappings, we'll trust that, otherwise we'll just guess. :/
  163. all_exts = guess_all_extensions(ctype, strict=False)
  164. return all_exts and all_exts[0]
  165. def _get_attachment_filename(self, part, ctype):
  166. # Try to get the filename using the default `get_filename()`
  167. # API.
  168. try:
  169. filename = oneline(part.get_filename(''))
  170. except (TypeError, UnicodeDecodeError):
  171. # Workaround for https://bugs.launchpad.net/mailman/+bug/1060951
  172. # (accented filenames).
  173. # In Python3 get_filename decodes the filename with
  174. # `errors=replace` which means, that if there are non-ascii
  175. # characters in the filename, they are replaced with '?'.
  176. filename = 'attachment.bin'
  177. filename, fext = os.path.splitext(filename)
  178. ext = fext or self._guess_all_extensions(ctype)
  179. # Now that we have a guessed extension and if it returned no values,
  180. # let's cook up some extensions depending on the content type.
  181. if not ext:
  182. if ctype == 'message/rfc822':
  183. ext = '.txt'
  184. else:
  185. ext = '.bin'
  186. # Remove anything other than alphanum, dot, dash or underscore.
  187. ext = SRE.sub('', ext)
  188. if not filename:
  189. # Use attachment as default filename if there is none.
  190. filebase = 'attachment'
  191. else:
  192. # Sanitize the filename given in the message headers.
  193. parts = PRE.split(filename)
  194. filename = parts[-1]
  195. # Strip off the leading dots.
  196. filename = DRE.sub('', filename)
  197. # Allow only alphanumerics, dash, underscore, and dot
  198. # i18n filenames are not supported yet,
  199. # see https://bugs.launchpad.net/bugs/1060951
  200. filename = SRE.sub('', filename)
  201. # If the filename's extension doesn't match the type we guessed,
  202. # which one should we go with? For now, let's go with the one we
  203. # guessed so attachments can't lie about their type. Also, if the
  204. # filename /has/ no extension, then tack on the one we guessed.
  205. # The extension was removed from the name above.
  206. filebase = filename
  207. return filebase + ext
  208. def _get_text_one_part(self, msg):
  209. """
  210. Returns decoded payload for a non-multipart message.
  211. """
  212. # MAS: TypeError exception can occur if payload is None. This
  213. # was observed with a message that contained an attached
  214. # message/delivery-status part. Because of the special parsing
  215. # of this type, this resulted in a text/plain sub-part with a
  216. # null body. See bug 1430236.
  217. charset = self._get_charset(msg, guess=True)
  218. payload = msg.get_payload(decode=True)
  219. try:
  220. result = payload.decode(charset)
  221. except (UnicodeDecodeError, LookupError, ValueError, AssertionError):
  222. result = payload.decode('utf-8', 'replace')
  223. next_part_match = NEXT_PART.search(result)
  224. if next_part_match:
  225. result = result[0:next_part_match.start(0)]
  226. # MAS Remove any null butes from the result.
  227. result = re.sub('\x00', '', result)
  228. return result
  229. def _get_text(self):
  230. if self.msg.is_multipart():
  231. # We now want to concatenate all the parts which have been scrubbed
  232. # to text/plain, into a single text/plain payload. We need to make
  233. # sure all the characters in the concatenated string are in the
  234. # same encoding, so we'll use the 'replace' key in the coercion
  235. # call.
  236. # BAW: Martin's original patch suggested we might want to try
  237. # generalizing to utf-8, and that's probably a good idea
  238. # (eventually).
  239. text = []
  240. for part in self.msg.walk():
  241. # Walk through the message and collect all the plaintext parts
  242. # and leave all the multiparts.
  243. if part.is_multipart():
  244. continue
  245. ctype = part.get_content_type()
  246. # Ignore anything other text/plain and text/html
  247. if ctype != 'text/plain' and (
  248. ctype != 'text/html' or self.sanitize != 2):
  249. continue
  250. part_content = self._get_text_one_part(part)
  251. if isinstance(part_content, str):
  252. if not part_content.endswith('\n'):
  253. part_content += '\n'
  254. text.append(part_content)
  255. # MAS remove any null bytes from the text.
  256. return re.sub('\x00', '', '\n'.join(text))
  257. else:
  258. return self._get_text_one_part(self.msg)