PageRenderTime 951ms CodeModel.GetById 16ms RepoModel.GetById 0ms app.codeStats 0ms

/django_mailman3/lib/scrub.py

https://gitlab.com/thelinuxguy/django-mailman3
Python | 277 lines | 197 code | 13 blank | 67 comment | 27 complexity | 78437701b90123e979044e69848756ab MD5 | raw file
  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright (C) 2016 by the Free Software Foundation, Inc.
  4. #
  5. # This program is free software; you can redistribute it and/or
  6. # modify it under the terms of the GNU General Public License
  7. # as published by the Free Software Foundation; either version 2
  8. # of the License, or (at your option) any later version.
  9. #
  10. # This program is distributed in the hope that it will be useful,
  11. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. # GNU General Public License for more details.
  14. #
  15. # You should have received a copy of the GNU General Public License
  16. # along with this program; if not, write to the Free Software
  17. # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
  18. # USA.
  19. """Cleanse a message for archiving."""
  20. from __future__ import absolute_import, unicode_literals
  21. import os
  22. import re
  23. import binascii
  24. from django.utils.six import integer_types
  25. from mimetypes import guess_all_extensions
  26. from email.header import decode_header, make_header
  27. from email.errors import HeaderParseError
  28. # Path characters for common platforms
  29. pre = re.compile(r'[/\\:]')
  30. # All other characters to strip out of Content-Disposition: filenames
  31. # (essentially anything that isn't an alphanum, dot, dash, or underscore).
  32. sre = re.compile(r'[^-\w.]')
  33. # Regexp to strip out leading dots
  34. dre = re.compile(r'^\.*')
  35. BR = '<br>\n'
  36. NEXT_PART = re.compile(r'--------------[ ]next[ ]part[ ]--------------\n')
  37. def guess_extension(ctype, ext):
  38. # mimetypes maps multiple extensions to the same type, e.g. .doc, .dot,
  39. # and .wiz are all mapped to application/msword. This sucks for finding
  40. # the best reverse mapping. If the extension is one of the giving
  41. # mappings, we'll trust that, otherwise we'll just guess. :/
  42. all_exts = guess_all_extensions(ctype, strict=False)
  43. if ext in all_exts:
  44. return ext
  45. return all_exts and all_exts[0]
  46. def get_charset(message, default="ascii", guess=False):
  47. """
  48. Get the message charset.
  49. http://ginstrom.com/scribbles/2007/11/19/parsing-multilingual-email-with-python/
  50. """
  51. if message.get_content_charset():
  52. return message.get_content_charset().decode("ascii")
  53. if message.get_charset():
  54. return message.get_charset().decode("ascii")
  55. charset = default
  56. if not guess:
  57. return charset
  58. # Try to guess the encoding (best effort mode)
  59. text = message.get_payload(decode=True)
  60. for encoding in ["ascii", "utf-8", "iso-8859-15"]:
  61. try:
  62. text.decode(encoding)
  63. except UnicodeDecodeError:
  64. continue
  65. else:
  66. charset = encoding
  67. break
  68. return charset
  69. def oneline(s):
  70. """Inspired by mailman.utilities.string.oneline"""
  71. try:
  72. h = make_header(decode_header(s))
  73. ustr = h.__unicode__()
  74. return ''.join(ustr.splitlines())
  75. except (LookupError, UnicodeError, ValueError, HeaderParseError):
  76. # possibly charset problem. return with undecoded string in one line.
  77. return ''.join(s.splitlines())
  78. class Scrubber(object):
  79. """
  80. Scrubs a single message, extracts attachments, and return the text and the
  81. attachments.
  82. http://ginstrom.com/scribbles/2007/11/19/parsing-multilingual-email-with-python/
  83. """
  84. def __init__(self, msg):
  85. self.msg = msg
  86. def scrub(self):
  87. attachments = []
  88. sanitize = 1 # TODO: implement other options
  89. # Now walk over all subparts of this message and scrub out various
  90. # types
  91. for part_num, part in enumerate(self.msg.walk()):
  92. ctype = part.get_content_type()
  93. if not isinstance(ctype, unicode):
  94. ctype = ctype.decode("ascii")
  95. # If the part is text/plain, we leave it alone
  96. if ctype == 'text/plain':
  97. disposition = part.get('content-disposition')
  98. if disposition and disposition.decode(
  99. "ascii", "replace"
  100. ).strip().startswith("attachment"):
  101. # part is attached
  102. attachments.append(self.parse_attachment(part, part_num))
  103. part.set_payload('')
  104. elif ctype == 'text/html' and isinstance(sanitize, integer_types):
  105. if sanitize == 1:
  106. # Don't HTML-escape it, this is the frontend's job
  107. attachments.append(self.parse_attachment(
  108. part, part_num, filter_html=False))
  109. part.set_payload('')
  110. elif ctype == 'message/rfc822':
  111. # This part contains a submessage, so it too needs scrubbing
  112. attachments.append(self.parse_attachment(part, part_num))
  113. part.set_payload('')
  114. # If the message isn't a multipart, then we'll strip it out as an
  115. # attachment that would have to be separately downloaded.
  116. elif part.get_payload() and not part.is_multipart():
  117. payload = part.get_payload(decode=True)
  118. ctype = part.get_content_type()
  119. if not isinstance(ctype, unicode):
  120. ctype.decode("ascii")
  121. # XXX Under email 2.5, it is possible that payload will be
  122. # None. This can happen when you have a Content-Type:
  123. # multipart/* with only one part and that part has two blank
  124. # lines between the first boundary and the end boundary. In
  125. # email 3.0 you end up with a string in the payload. I think
  126. # in this case it's safe to ignore the part.
  127. if payload is None:
  128. continue
  129. attachments.append(self.parse_attachment(part, part_num))
  130. # We still have to sanitize multipart messages to flat text because
  131. # Pipermail can't handle messages with list payloads. This is a
  132. # kludge; def (n) clever hack ;).
  133. if self.msg.is_multipart():
  134. # We now want to concatenate all the parts which have been scrubbed
  135. # to text/plain, into a single text/plain payload. We need to make
  136. # sure all the characters in the concatenated string are in the
  137. # same encoding, so we'll use the 'replace' key in the coercion
  138. # call.
  139. # BAW: Martin's original patch suggested we might want to try
  140. # generalizing to utf-8, and that's probably a good idea
  141. # (eventually).
  142. text = []
  143. for part in self.msg.walk():
  144. # TK: bug-id 1099138 and multipart
  145. # MAS test payload - if part may fail if there are no headers.
  146. if not part.get_payload() or part.is_multipart():
  147. continue
  148. # All parts should be scrubbed to text/plain by now, except
  149. # if sanitize == 2, there could be text/html parts so keep them
  150. # but skip any other parts.
  151. partctype = part.get_content_type()
  152. if partctype != 'text/plain' and (partctype != 'text/html' or
  153. sanitize != 2):
  154. # text.append(_('Skipped content of type %(partctype)s\n'))
  155. continue
  156. try:
  157. t = part.get_payload(decode=True) or ''
  158. # MAS: TypeError exception can occur if payload is None. This
  159. # was observed with a message that contained an attached
  160. # message/delivery-status part. Because of the special parsing
  161. # of this type, this resulted in a text/plain sub-part with a
  162. # null body. See bug 1430236.
  163. except (binascii.Error, TypeError):
  164. t = part.get_payload() or ''
  165. partcharset = get_charset(part, guess=True)
  166. try:
  167. t = t.decode(partcharset, 'replace')
  168. except (UnicodeError, LookupError, ValueError,
  169. AssertionError):
  170. # We can get here if partcharset is bogus in some way.
  171. # Replace funny characters. We use errors='replace'
  172. t = t.decode('ascii', 'replace')
  173. # Separation is useful
  174. if isinstance(t, basestring):
  175. if not t.endswith('\n'):
  176. t += '\n'
  177. text.append(t)
  178. text = u"\n".join(text)
  179. else:
  180. text = self.msg.get_payload(decode=True)
  181. charset = get_charset(self.msg, guess=True)
  182. try:
  183. text = text.decode(charset, "replace")
  184. except (UnicodeError, LookupError, ValueError, AssertionError):
  185. text = text.decode('ascii', 'replace')
  186. next_part_match = NEXT_PART.search(text)
  187. if next_part_match:
  188. text = text[0:next_part_match.start(0)]
  189. return (text, attachments)
  190. def parse_attachment(self, part, counter, filter_html=True):
  191. # Store name, content-type and size
  192. # Figure out the attachment type and get the decoded data
  193. decodedpayload = part.get_payload(decode=True)
  194. # BAW: mimetypes ought to handle non-standard, but commonly found
  195. # types, e.g. image/jpg (should be image/jpeg). For now we just store
  196. # such things as application/octet-streams since that seems the safest.
  197. ctype = part.get_content_type()
  198. if not isinstance(ctype, unicode):
  199. ctype = ctype.decode("ascii")
  200. charset = get_charset(part, default=None, guess=False)
  201. # i18n file name is encoded
  202. try:
  203. filename = oneline(part.get_filename(''))
  204. except (TypeError, UnicodeDecodeError):
  205. # Workaround for https://bugs.launchpad.net/mailman/+bug/1060951
  206. # (accented filenames)
  207. filename = u"attachment.bin"
  208. filename, fnext = os.path.splitext(filename)
  209. # For safety, we should confirm this is valid ext for content-type
  210. # but we can use fnext if we introduce fnext filtering
  211. # TODO: re-implement this
  212. # if mm_cfg.SCRUBBER_USE_ATTACHMENT_FILENAME_EXTENSION:
  213. # # HTML message doesn't have filename :-(
  214. # ext = fnext or guess_extension(ctype, fnext)
  215. # else:
  216. # ext = guess_extension(ctype, fnext)
  217. ext = fnext or guess_extension(ctype, fnext)
  218. if not ext:
  219. # We don't know what it is, so assume it's just a shapeless
  220. # application/octet-stream, unless the Content-Type: is
  221. # message/rfc822, in which case we know we'll coerce the type to
  222. # text/plain below.
  223. if ctype == 'message/rfc822':
  224. ext = '.txt'
  225. else:
  226. ext = '.bin'
  227. # Allow only alphanumerics, dash, underscore, and dot
  228. ext = sre.sub('', ext)
  229. # Now base the filename on what's in the attachment, uniquifying it if
  230. # necessary.
  231. if not filename:
  232. filebase = u'attachment'
  233. else:
  234. # Sanitize the filename given in the message headers
  235. parts = pre.split(filename)
  236. filename = parts[-1]
  237. # Strip off leading dots
  238. filename = dre.sub('', filename)
  239. # Allow only alphanumerics, dash, underscore, and dot
  240. # i18n filenames are not supported yet,
  241. # see https://bugs.launchpad.net/bugs/1060951
  242. filename = sre.sub('', filename)
  243. # If the filename's extension doesn't match the type we guessed,
  244. # which one should we go with? For now, let's go with the one we
  245. # guessed so attachments can't lie about their type. Also, if the
  246. # filename /has/ no extension, then tack on the one we guessed.
  247. # The extension was removed from the name above.
  248. filebase = filename
  249. # TODO: bring back the HTML sanitizer feature
  250. if ctype == 'message/rfc822':
  251. submsg = part.get_payload()
  252. # Don't HTML-escape it, this is the frontend's job
  253. decodedpayload = str(submsg)
  254. return (counter, filebase + ext, ctype, charset, decodedpayload)