PageRenderTime 55ms CodeModel.GetById 31ms RepoModel.GetById 0ms app.codeStats 0ms

/server/superdesk/io/rfc822.py

https://gitlab.com/wilane/superdesk
Python | 264 lines | 201 code | 34 blank | 29 comment | 45 complexity | e2284e196625c738b5308fd7d8963923 MD5 | raw file
  1. # -*- coding: utf-8; -*-
  2. #
  3. # This file is part of Superdesk.
  4. #
  5. # Copyright 2013, 2014 Sourcefabric z.u. and contributors.
  6. #
  7. # For the full copyright and license information, please see the
  8. # AUTHORS and LICENSE files distributed with this source code, or
  9. # at https://www.sourcefabric.org/superdesk/license*.
  10. from superdesk.io import Parser
  11. import datetime
  12. from superdesk.utc import utcnow
  13. from pytz import timezone
  14. from superdesk.media.media_operations import process_file_from_stream
  15. from apps.archive.archive_media import generate_guid, GUID_TAG
  16. import io
  17. from flask import current_app as app
  18. import email
  19. from email.header import decode_header
  20. import logging
  21. from superdesk.errors import IngestEmailError
  22. from bs4 import BeautifulSoup, Comment, Doctype
  23. import re
  24. logger = logging.getLogger(__name__)
  25. class rfc822Parser(Parser):
  26. def __init__(self):
  27. self.parser_app = app
  28. def parse_email(self, data, provider):
  29. try:
  30. new_items = []
  31. # create an item for the body text of the email
  32. # either text or html
  33. item = dict()
  34. item['type'] = 'text'
  35. item['versioncreated'] = utcnow()
  36. comp_item = None
  37. # a list to keep the references to the attachments
  38. refs = []
  39. html_body = None
  40. text_body = None
  41. for response_part in data:
  42. if isinstance(response_part, tuple):
  43. msg = email.message_from_bytes(response_part[1])
  44. item['headline'] = self.parse_header(msg['subject'])
  45. item['original_creator'] = self.parse_header(msg['from'])
  46. item['guid'] = msg['Message-ID']
  47. date_tuple = email.utils.parsedate_tz(msg['Date'])
  48. if date_tuple:
  49. dt = datetime.datetime.utcfromtimestamp(email.utils.mktime_tz(date_tuple))
  50. dt = dt.replace(tzinfo=timezone('utc'))
  51. item['firstcreated'] = dt
  52. # this will loop through all the available multiparts in mail
  53. for part in msg.walk():
  54. if part.get_content_type() == "text/plain":
  55. body = part.get_payload(decode=True)
  56. try:
  57. # if we don't know the charset just have a go!
  58. if part.get_content_charset() is None:
  59. text_body = body.decode()
  60. else:
  61. charset = part.get_content_charset()
  62. text_body = body.decode(charset)
  63. continue
  64. except Exception as ex:
  65. logger.exception(
  66. "Exception parsing text body for {0} from {1}".format(item['headline'],
  67. item['original_creator']), ex)
  68. continue
  69. if part.get_content_type() == "text/html":
  70. body = part.get_payload(decode=True)
  71. try:
  72. if part.get_content_charset() is None:
  73. html_body = body.decode()
  74. else:
  75. charset = part.get_content_charset()
  76. html_body = body.decode(charset)
  77. html_body = self.safe_html(html_body)
  78. continue
  79. except Exception as ex:
  80. logger.exception(
  81. "Exception parsing text html for {0} from {1}".format(item['headline'],
  82. item['original_creator']), ex)
  83. continue
  84. if part.get_content_maintype() == 'multipart':
  85. continue
  86. if part.get('Content-Disposition') is None:
  87. continue
  88. # we are only going to pull off image attachments at this stage
  89. if part.get_content_maintype() != 'image':
  90. continue
  91. fileName = part.get_filename()
  92. if bool(fileName):
  93. image = part.get_payload(decode=True)
  94. content = io.BytesIO(image)
  95. res = process_file_from_stream(content, part.get_content_type())
  96. file_name, content_type, metadata = res
  97. if content_type == 'image/gif' or content_type == 'image/png':
  98. continue
  99. content.seek(0)
  100. image_id = self.parser_app.media.put(content, filename=fileName,
  101. content_type=content_type, metadata=metadata)
  102. renditions = {'baseImage': {'href': image_id}}
  103. # if we have not got a composite item then create one
  104. if not comp_item:
  105. comp_item = dict()
  106. comp_item['type'] = 'composite'
  107. comp_item['guid'] = generate_guid(type=GUID_TAG)
  108. comp_item['versioncreated'] = utcnow()
  109. comp_item['groups'] = []
  110. comp_item['headline'] = item['headline']
  111. comp_item['groups'] = []
  112. # create a reference to the item that stores the body of the email
  113. item_ref = {}
  114. item_ref['guid'] = item['guid']
  115. item_ref['residRef'] = item['guid']
  116. item_ref['headline'] = item['headline']
  117. item_ref['location'] = 'ingest'
  118. item_ref['itemClass'] = 'icls:text'
  119. refs.append(item_ref)
  120. media_item = dict()
  121. media_item['guid'] = generate_guid(type=GUID_TAG)
  122. media_item['versioncreated'] = utcnow()
  123. media_item['type'] = 'picture'
  124. media_item['renditions'] = renditions
  125. media_item['mimetype'] = content_type
  126. media_item['filemeta'] = metadata
  127. media_item['slugline'] = fileName
  128. if text_body is not None:
  129. media_item['body_html'] = text_body
  130. media_item['headline'] = item['headline']
  131. new_items.append(media_item)
  132. # add a reference to this item in the composite item
  133. media_ref = {}
  134. media_ref['guid'] = media_item['guid']
  135. media_ref['residRef'] = media_item['guid']
  136. media_ref['headline'] = fileName
  137. media_ref['location'] = 'ingest'
  138. media_ref['itemClass'] = 'icls:picture'
  139. refs.append(media_ref)
  140. if html_body is not None:
  141. item['body_html'] = html_body
  142. else:
  143. item['body_html'] = text_body
  144. item['type'] = 'preformatted'
  145. # if there is composite item then add the main group and references
  146. if comp_item:
  147. grefs = {}
  148. grefs['refs'] = [{'idRef': 'main'}]
  149. grefs['id'] = 'root'
  150. grefs['role'] = 'grpRole:NEP'
  151. comp_item['groups'].append(grefs)
  152. grefs = {}
  153. grefs['refs'] = refs
  154. grefs['id'] = 'main'
  155. grefs['role'] = 'grpRole:Main'
  156. comp_item['groups'].append(grefs)
  157. new_items.append(comp_item)
  158. new_items.append(item)
  159. return new_items
  160. except Exception as ex:
  161. raise IngestEmailError.emailParseError(ex, provider)
  162. def parse_header(self, field):
  163. try:
  164. hdr = decode_header(field)
  165. encoding = hdr[0][1]
  166. if encoding and hdr:
  167. parsed_field = hdr[0][0].decode(encoding)
  168. else:
  169. parsed_field = hdr[0][0]
  170. except:
  171. try:
  172. parsed_field = str(field)
  173. except:
  174. parsed_field = 'Unknown'
  175. pass
  176. return parsed_field
  177. # from http://chase-seibert.github.io/blog/2011/01/28/sanitize-html-with-beautiful-soup.html
  178. def safe_html(self, html):
  179. if not html:
  180. return None
  181. # remove these tags, complete with contents.
  182. blacklist = ["script", "style", "head"]
  183. whitelist = [
  184. "div", "span", "p", "br", "pre",
  185. "table", "tbody", "thead", "tr", "td", "a",
  186. "blockquote",
  187. "ul", "li", "ol",
  188. "b", "em", "i", "strong", "u", "font"
  189. ]
  190. try:
  191. # BeautifulSoup is catching out-of-order and unclosed tags, so markup
  192. # can't leak out of comments and break the rest of the page.
  193. soup = BeautifulSoup(html)
  194. except Exception as e:
  195. # special handling?
  196. raise e
  197. # remove the doctype declaration if present
  198. if isinstance(soup.contents[0], Doctype):
  199. soup.contents[0].extract()
  200. # now strip HTML we don't like.
  201. for tag in soup.findAll():
  202. if tag.name.lower() in blacklist:
  203. # blacklisted tags are removed in their entirety
  204. tag.extract()
  205. elif tag.name.lower() in whitelist:
  206. # tag is allowed. Make sure the attributes are allowed.
  207. attrs = dict(tag.attrs)
  208. for a in attrs:
  209. if self._attr_name_whitelisted(a):
  210. tag.attrs[a] = [self.safe_css(a, tag.attrs[a])]
  211. else:
  212. del tag.attrs[a]
  213. else:
  214. tag.replaceWithChildren()
  215. # scripts can be executed from comments in some cases
  216. comments = soup.findAll(text=lambda text: isinstance(text, Comment))
  217. for comment in comments:
  218. comment.extract()
  219. safe_html = str(soup)
  220. if safe_html == ", -":
  221. return None
  222. return safe_html.replace('</br>', '').replace('<br>', '<br/>')
  223. def _attr_name_whitelisted(self, attr_name):
  224. return attr_name.lower() in ["href", "style", "color", "size", "bgcolor", "border"]
  225. def safe_css(self, attr, css):
  226. if attr == "style":
  227. return re.sub("(width|height):[^;]+;", "", css)
  228. return css