PageRenderTime 26ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 1ms

/hyperkitty/lib/scrub.py

https://gitlab.com/mathuin/hyperkitty
Python | 311 lines | 224 code | 12 blank | 75 comment | 28 complexity | 0fea7194a6b8dee8fb1b3ab7c75d87a2 MD5 | raw file
  1. # Copyright (C) 2011-2012 by the Free Software Foundation, Inc.
  2. #
  3. # This program is free software; you can redistribute it and/or
  4. # modify it under the terms of the GNU General Public License
  5. # as published by the Free Software Foundation; either version 2
  6. # of the License, or (at your option) any later version.
  7. #
  8. # This program is distributed in the hope that it will be useful,
  9. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. # GNU General Public License for more details.
  12. #
  13. # You should have received a copy of the GNU General Public License
  14. # along with this program; if not, write to the Free Software
  15. # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
  16. # USA.
  17. """Cleanse a message for archiving."""
  18. from __future__ import absolute_import, unicode_literals
  19. import os
  20. import re
  21. import binascii
  22. from types import IntType
  23. from mimetypes import guess_all_extensions
  24. from email.header import decode_header, make_header
  25. from email.errors import HeaderParseError
# Path separator characters for common platforms ('/', '\' and ':').  Used
# to split a submitted filename so only its last component is kept.
pre = re.compile(r'[/\\:]')
# All other characters to strip out of Content-Disposition: filenames
# (essentially anything that isn't an alphanum, dot, dash, or underscore).
sre = re.compile(r'[^-\w.]')
# Regexp to strip out leading dots
dre = re.compile(r'^\.*')
# HTML line break; only referenced by the commented-out HTML-escaping code.
BR = '<br>\n'
# Separator line that introduces an already-scrubbed attachment notice in a
# message body; the body text is truncated at its first occurrence.
NEXT_PART = re.compile(r'--------------[ ]next[ ]part[ ]--------------\n')
  35. def guess_extension(ctype, ext):
  36. # mimetypes maps multiple extensions to the same type, e.g. .doc, .dot,
  37. # and .wiz are all mapped to application/msword. This sucks for finding
  38. # the best reverse mapping. If the extension is one of the giving
  39. # mappings, we'll trust that, otherwise we'll just guess. :/
  40. all_exts = guess_all_extensions(ctype, strict=False)
  41. if ext in all_exts:
  42. return ext
  43. return all_exts and all_exts[0]
  44. def get_charset(message, default="ascii", guess=False):
  45. """
  46. Get the message charset.
  47. From: http://ginstrom.com/scribbles/2007/11/19/parsing-multilingual-email-with-python/
  48. """
  49. if message.get_content_charset():
  50. return message.get_content_charset().decode("ascii")
  51. if message.get_charset():
  52. return message.get_charset().decode("ascii")
  53. charset = default
  54. if not guess:
  55. return charset
  56. # Try to guess the encoding (best effort mode)
  57. text = message.get_payload(decode=True)
  58. for encoding in ["ascii", "utf-8", "iso-8859-15"]:
  59. try:
  60. text.decode(encoding)
  61. except UnicodeDecodeError:
  62. continue
  63. else:
  64. charset = encoding
  65. break
  66. return charset
  67. def oneline(s):
  68. """Inspired by mailman.utilities.string.oneline"""
  69. try:
  70. h = make_header(decode_header(s))
  71. ustr = h.__unicode__()
  72. return ''.join(ustr.splitlines())
  73. except (LookupError, UnicodeError, ValueError, HeaderParseError):
  74. # possibly charset problem. return with undecoded string in one line.
  75. return ''.join(s.splitlines())
  76. class Scrubber(object):
  77. """
  78. Scrubs a single message, extracts attachments, and return the text and the
  79. attachments.
  80. See also: http://ginstrom.com/scribbles/2007/11/19/parsing-multilingual-email-with-python/
  81. """
  82. def __init__(self, mlist, msg):
  83. self.mlist = mlist
  84. self.msg = msg
  85. def scrub(self):
  86. attachments = []
  87. sanitize = 1 # TODO: implement other options
  88. #outer = True
  89. # Now walk over all subparts of this message and scrub out various types
  90. for part_num, part in enumerate(self.msg.walk()):
  91. ctype = part.get_content_type()
  92. if not isinstance(ctype, unicode):
  93. ctype = ctype.decode("ascii")
  94. # If the part is text/plain, we leave it alone
  95. if ctype == 'text/plain':
  96. disposition = part.get('content-disposition')
  97. if disposition and disposition.decode("ascii", "replace"
  98. ).strip().startswith("attachment"):
  99. # part is attached
  100. attachments.append(self.parse_attachment(part, part_num))
  101. part.set_payload('')
  102. elif ctype == 'text/html' and isinstance(sanitize, IntType):
  103. # if sanitize == 0:
  104. # if outer:
  105. # raise DiscardMessage
  106. # replace_payload_by_text(part,
  107. # _('HTML attachment scrubbed and removed'),
  108. # # Adding charset arg and removing content-type
  109. # # sets content-type to text/plain
  110. # lcset)
  111. # elif sanitize == 2:
  112. # # By leaving it alone, Pipermail will automatically escape it
  113. # pass
  114. # elif sanitize == 3:
  115. # # Pull it out as an attachment but leave it unescaped. This
  116. # # is dangerous, but perhaps useful for heavily moderated
  117. # # lists.
  118. # attachments.append(self.parse_attachment(part, part_num, filter_html=False))
  119. # replace_payload_by_text(part, _("""\
  120. #An HTML attachment was scrubbed...
  121. #URL: %(url)s
  122. #"""), lcset)
  123. # else:
  124. if sanitize == 1:
  125. # Don't HTML-escape it, this is the frontend's job
  126. ## HTML-escape it and store it as an attachment, but make it
  127. ## look a /little/ bit prettier. :(
  128. #payload = websafe(part.get_payload(decode=True))
  129. ## For whitespace in the margin, change spaces into
  130. ## non-breaking spaces, and tabs into 8 of those. Then use a
  131. ## mono-space font. Still looks hideous to me, but then I'd
  132. ## just as soon discard them.
  133. #def doreplace(s):
  134. # return s.expandtabs(8).replace(' ', '&nbsp;')
  135. #lines = [doreplace(s) for s in payload.split('\n')]
  136. #payload = '<tt>\n' + BR.join(lines) + '\n</tt>\n'
  137. #part.set_payload(payload)
  138. ## We're replacing the payload with the decoded payload so this
  139. ## will just get in the way.
  140. #del part['content-transfer-encoding']
  141. attachments.append(self.parse_attachment(part, part_num, filter_html=False))
  142. part.set_payload('')
  143. elif ctype == 'message/rfc822':
  144. # This part contains a submessage, so it too needs scrubbing
  145. attachments.append(self.parse_attachment(part, part_num))
  146. part.set_payload('')
  147. # If the message isn't a multipart, then we'll strip it out as an
  148. # attachment that would have to be separately downloaded.
  149. elif part.get_payload() and not part.is_multipart():
  150. payload = part.get_payload(decode=True)
  151. ctype = part.get_content_type()
  152. if not isinstance(ctype, unicode):
  153. ctype.decode("ascii")
  154. # XXX Under email 2.5, it is possible that payload will be None.
  155. # This can happen when you have a Content-Type: multipart/* with
  156. # only one part and that part has two blank lines between the
  157. # first boundary and the end boundary. In email 3.0 you end up
  158. # with a string in the payload. I think in this case it's safe to
  159. # ignore the part.
  160. if payload is None:
  161. continue
  162. attachments.append(self.parse_attachment(part, part_num))
  163. #outer = False
  164. # We still have to sanitize multipart messages to flat text because
  165. # Pipermail can't handle messages with list payloads. This is a kludge;
  166. # def (n) clever hack ;).
  167. if self.msg.is_multipart():
  168. # We now want to concatenate all the parts which have been scrubbed to
  169. # text/plain, into a single text/plain payload. We need to make sure
  170. # all the characters in the concatenated string are in the same
  171. # encoding, so we'll use the 'replace' key in the coercion call.
  172. # BAW: Martin's original patch suggested we might want to try
  173. # generalizing to utf-8, and that's probably a good idea (eventually).
  174. text = []
  175. for part in self.msg.walk():
  176. # TK: bug-id 1099138 and multipart
  177. # MAS test payload - if part may fail if there are no headers.
  178. if not part.get_payload() or part.is_multipart():
  179. continue
  180. # All parts should be scrubbed to text/plain by now, except
  181. # if sanitize == 2, there could be text/html parts so keep them
  182. # but skip any other parts.
  183. partctype = part.get_content_type()
  184. if partctype != 'text/plain' and (partctype != 'text/html' or
  185. sanitize != 2):
  186. #text.append(_('Skipped content of type %(partctype)s\n'))
  187. continue
  188. try:
  189. t = part.get_payload(decode=True) or ''
  190. # MAS: TypeError exception can occur if payload is None. This
  191. # was observed with a message that contained an attached
  192. # message/delivery-status part. Because of the special parsing
  193. # of this type, this resulted in a text/plain sub-part with a
  194. # null body. See bug 1430236.
  195. except (binascii.Error, TypeError):
  196. t = part.get_payload() or ''
  197. partcharset = get_charset(part, guess=True)
  198. try:
  199. t = t.decode(partcharset, 'replace')
  200. except (UnicodeError, LookupError, ValueError,
  201. AssertionError):
  202. # We can get here if partcharset is bogus in some way.
  203. # Replace funny characters. We use errors='replace'
  204. t = t.decode('ascii', 'replace')
  205. # Separation is useful
  206. if isinstance(t, basestring):
  207. if not t.endswith('\n'):
  208. t += '\n'
  209. text.append(t)
  210. text = u"\n".join(text)
  211. else:
  212. text = self.msg.get_payload(decode=True)
  213. charset = get_charset(self.msg, guess=True)
  214. try:
  215. text = text.decode(charset, "replace")
  216. except (UnicodeError, LookupError, ValueError, AssertionError):
  217. text = text.decode('ascii', 'replace')
  218. next_part_match = NEXT_PART.search(text)
  219. if next_part_match:
  220. text = text[0:next_part_match.start(0)]
  221. return (text, attachments)
  222. def parse_attachment(self, part, counter, filter_html=True):
  223. # pylint: disable=unused-argument
  224. # Store name, content-type and size
  225. # Figure out the attachment type and get the decoded data
  226. decodedpayload = part.get_payload(decode=True)
  227. # BAW: mimetypes ought to handle non-standard, but commonly found types,
  228. # e.g. image/jpg (should be image/jpeg). For now we just store such
  229. # things as application/octet-streams since that seems the safest.
  230. ctype = part.get_content_type()
  231. if not isinstance(ctype, unicode):
  232. ctype = ctype.decode("ascii")
  233. charset = get_charset(part, default=None, guess=False)
  234. # i18n file name is encoded
  235. try:
  236. filename = oneline(part.get_filename(''))
  237. except (TypeError, UnicodeDecodeError):
  238. # Workaround for https://bugs.launchpad.net/mailman/+bug/1060951
  239. # (accented filenames)
  240. filename = u"attachment.bin"
  241. filename, fnext = os.path.splitext(filename)
  242. # For safety, we should confirm this is valid ext for content-type
  243. # but we can use fnext if we introduce fnext filtering
  244. # TODO: re-implement this
  245. #if mm_cfg.SCRUBBER_USE_ATTACHMENT_FILENAME_EXTENSION:
  246. # # HTML message doesn't have filename :-(
  247. # ext = fnext or guess_extension(ctype, fnext)
  248. #else:
  249. # ext = guess_extension(ctype, fnext)
  250. ext = fnext or guess_extension(ctype, fnext)
  251. if not ext:
  252. # We don't know what it is, so assume it's just a shapeless
  253. # application/octet-stream, unless the Content-Type: is
  254. # message/rfc822, in which case we know we'll coerce the type to
  255. # text/plain below.
  256. if ctype == 'message/rfc822':
  257. ext = '.txt'
  258. else:
  259. ext = '.bin'
  260. # Allow only alphanumerics, dash, underscore, and dot
  261. ext = sre.sub('', ext)
  262. # Now base the filename on what's in the attachment, uniquifying it if
  263. # necessary.
  264. if not filename:
  265. filebase = u'attachment'
  266. else:
  267. # Sanitize the filename given in the message headers
  268. parts = pre.split(filename)
  269. filename = parts[-1]
  270. # Strip off leading dots
  271. filename = dre.sub('', filename)
  272. # Allow only alphanumerics, dash, underscore, and dot
  273. # i18n filenames are not supported yet,
  274. # see https://bugs.launchpad.net/bugs/1060951
  275. filename = sre.sub('', filename)
  276. # If the filename's extension doesn't match the type we guessed,
  277. # which one should we go with? For now, let's go with the one we
  278. # guessed so attachments can't lie about their type. Also, if the
  279. # filename /has/ no extension, then tack on the one we guessed.
  280. # The extension was removed from the name above.
  281. filebase = filename
  282. # TODO: bring back the HTML sanitizer feature
  283. if ctype == 'message/rfc822':
  284. submsg = part.get_payload()
  285. # Don't HTML-escape it, this is the frontend's job
  286. ## BAW: I'm sure we can eventually do better than this. :(
  287. #decodedpayload = websafe(str(submsg))
  288. decodedpayload = str(submsg)
  289. return (counter, filebase+ext, ctype, charset, decodedpayload)