rfc822.py | searchcode

/server/superdesk/io/rfc822.py

https://gitlab.com/wilane/superdesk
Python | 264 lines | 201 code | 34 blank | 29 comment | 45 complexity | e2284e196625c738b5308fd7d8963923 MD5 | raw file


# -*- coding: utf-8; -*-
#
# This file is part of Superdesk.
#
# Copyright 2013, 2014 Sourcefabric z.u. and contributors.
#
# For the full copyright and license information, please see the
# AUTHORS and LICENSE files distributed with this source code, or
# at https://www.sourcefabric.org/superdesk/license*.

from superdesk.io import Parser
import datetime
from superdesk.utc import utcnow
from pytz import timezone
from superdesk.media.media_operations import process_file_from_stream
from apps.archive.archive_media import generate_guid, GUID_TAG
import io
from flask import current_app as app
import email
from email.header import decode_header
import logging
from superdesk.errors import IngestEmailError
from bs4 import BeautifulSoup, Comment, Doctype
import re


logger = logging.getLogger(__name__)


class rfc822Parser(Parser):
    """Turn RFC 822 e-mail messages into ingest items.

    The message body becomes a text (or preformatted) item. Image attachments
    other than GIF/PNG are stored through ``app.media`` and become picture
    items, and a composite item is added to tie body and pictures together.
    """

    # tags removed together with everything inside them
    _BLACKLISTED_TAGS = frozenset({"script", "style", "head"})

    # tags kept in the output; their attributes are filtered separately
    _WHITELISTED_TAGS = frozenset({
        "div", "span", "p", "br", "pre",
        "table", "tbody", "thead", "tr", "td", "a",
        "blockquote",
        "ul", "li", "ol",
        "b", "em", "i", "strong", "u", "font",
    })

    _WHITELISTED_ATTRS = frozenset({"href", "style", "color", "size", "bgcolor", "border"})

    # a complete ``width``/``height`` declaration: anchored at the start of the
    # value or right after a ``;`` so ``line-height``/``max-width`` are left
    # intact, and allowed to end at the end of the value without a ``;``
    _DIMENSION_RE = re.compile(r'(?:^|(?<=;))\s*(?:width|height)\s*:[^;]*(?:;|$)', re.IGNORECASE)

    # URL schemes that execute or embed content when used in a link
    _SCRIPT_SCHEME_RE = re.compile(r'^(?:javascript|vbscript|data):', re.IGNORECASE)

    def __init__(self):
        """Keep a reference to the current Flask app, used to store media."""
        self.parser_app = app

    def parse_email(self, data, provider):
        """Build the ingest items for the message(s) contained in *data*.

        :param data: iterable of response parts; only ``tuple`` elements are
            used and their second element is parsed as the raw message bytes
            (presumably an IMAP fetch response -- confirm against the caller)
        :param provider: ingest provider, only passed on to the raised error
        :return: list of items: the picture items (if any), then the composite
            item (if any pictures were found), then the body item
        :raises IngestEmailError: if anything unexpected fails while parsing
        """
        try:
            new_items = []
            # the item holding the body of the email, either text or html
            item = {'type': 'text', 'versioncreated': utcnow()}

            comp_item = None

            # references to the body item and the attachments, for the composite
            refs = []

            html_body = None
            text_body = None

            for response_part in data:
                if not isinstance(response_part, tuple):
                    continue

                msg = email.message_from_bytes(response_part[1])
                item['headline'] = self.parse_header(msg['subject'])
                item['original_creator'] = self.parse_header(msg['from'])
                item['guid'] = msg['Message-ID']
                date_tuple = email.utils.parsedate_tz(msg['Date'])
                if date_tuple:
                    item['firstcreated'] = datetime.datetime.fromtimestamp(
                        email.utils.mktime_tz(date_tuple), tz=timezone('utc'))

                # this will loop through all the available multiparts in mail
                for part in msg.walk():
                    part_type = part.get_content_type()

                    if part_type == "text/plain":
                        payload = part.get_payload(decode=True)
                        try:
                            text_body = self._decode_payload(payload, part)
                        except Exception:
                            logger.exception("Exception parsing text body for %s from %s",
                                             item['headline'], item['original_creator'])
                        continue

                    if part_type == "text/html":
                        payload = part.get_payload(decode=True)
                        try:
                            # only keep the markup once it has been sanitised, so a
                            # failing sanitiser can never leak the raw html
                            html_body = self.safe_html(self._decode_payload(payload, part))
                        except Exception:
                            logger.exception("Exception parsing text html for %s from %s",
                                             item['headline'], item['original_creator'])
                        continue

                    if part.get_content_maintype() == 'multipart':
                        continue
                    if part.get('Content-Disposition') is None:
                        continue
                    # we are only going to pull off image attachments at this stage
                    if part.get_content_maintype() != 'image':
                        continue

                    file_name = part.get_filename()
                    if not file_name:
                        continue

                    content = io.BytesIO(part.get_payload(decode=True))
                    _, content_type, metadata = process_file_from_stream(content, part.get_content_type())
                    if content_type in ('image/gif', 'image/png'):
                        continue
                    # rewind before storing; the stream was handed to process_file_from_stream
                    content.seek(0)
                    image_id = self.parser_app.media.put(content, filename=file_name,
                                                         content_type=content_type, metadata=metadata)

                    # if we have not got a composite item then create one
                    if not comp_item:
                        comp_item = {
                            'type': 'composite',
                            'guid': generate_guid(type=GUID_TAG),
                            'versioncreated': utcnow(),
                            'groups': [],
                            'headline': item['headline'],
                        }
                        # reference to the item that stores the body of the email
                        refs.append(self._make_ref(item['guid'], item['headline'], 'icls:text'))

                    media_item = {
                        'guid': generate_guid(type=GUID_TAG),
                        'versioncreated': utcnow(),
                        'type': 'picture',
                        'renditions': {'baseImage': {'href': image_id}},
                        'mimetype': content_type,
                        'filemeta': metadata,
                        'slugline': file_name,
                    }
                    if text_body is not None:
                        media_item['body_html'] = text_body
                    media_item['headline'] = item['headline']
                    new_items.append(media_item)

                    # add a reference to this item in the composite item
                    refs.append(self._make_ref(media_item['guid'], file_name, 'icls:picture'))

            if html_body is not None:
                item['body_html'] = html_body
            else:
                item['body_html'] = text_body
                item['type'] = 'preformatted'

            # if there is composite item then add the main group and references
            if comp_item:
                comp_item['groups'].append({'refs': [{'idRef': 'main'}], 'id': 'root', 'role': 'grpRole:NEP'})
                comp_item['groups'].append({'refs': refs, 'id': 'main', 'role': 'grpRole:Main'})
                new_items.append(comp_item)

            new_items.append(item)
            return new_items
        except Exception as ex:
            raise IngestEmailError.emailParseError(ex, provider)

    @staticmethod
    def _decode_payload(payload, part):
        """Decode the bytes *payload* using the charset declared by *part*."""
        charset = part.get_content_charset()
        # if we don't know the charset just have a go!
        if charset is None:
            return payload.decode()
        return payload.decode(charset)

    @staticmethod
    def _make_ref(guid, headline, item_class):
        """Return a composite group reference to the ingest item *guid*."""
        return {
            'guid': guid,
            'residRef': guid,
            'headline': headline,
            'location': 'ingest',
            'itemClass': item_class,
        }

    def parse_header(self, field):
        """Return the header value *field* as a readable string.

        Every fragment returned by ``decode_header`` is decoded and the
        fragments are joined, so headers mixing plain and encoded words come
        back complete and always as ``str``.

        :param field: header value as returned by the message object
        :return: the decoded text; ``str(field)`` if the value cannot be
            parsed as a header at all, ``'Unknown'`` if even that fails
        """
        try:
            fragments = decode_header(field)
        except Exception:
            # not something decode_header can handle (e.g. a missing header)
            try:
                return str(field)
            except Exception:
                return 'Unknown'

        decoded = []
        for value, charset in fragments:
            if isinstance(value, bytes):
                try:
                    value = value.decode(charset or 'ascii')
                except (LookupError, UnicodeError):
                    # unknown or wrong charset: keep what can be read
                    value = value.decode('utf-8', errors='replace')
            decoded.append(value)
        return ''.join(decoded)

    # from http://chase-seibert.github.io/blog/2011/01/28/sanitize-html-with-beautiful-soup.html
    def safe_html(self, html):
        """Return *html* reduced to the whitelisted tags and attributes.

        Blacklisted tags are dropped with their content, whitelisted tags keep
        only their whitelisted attributes, every other tag is replaced by its
        children, and comments are removed.

        :param html: markup to clean
        :return: the cleaned markup, or ``None`` when *html* is empty
        """
        if not html:
            return None

        # BeautifulSoup is catching out-of-order and unclosed tags, so markup
        # can't leak out of comments and break the rest of the page.
        # NOTE(review): no parser is named here, so the result may depend on
        # which parsers are installed -- consider pinning one.
        soup = BeautifulSoup(html)

        # remove the doctype declaration if present
        if soup.contents and isinstance(soup.contents[0], Doctype):
            soup.contents[0].extract()

        # now strip HTML we don't like.
        for tag in soup.find_all():
            tag_name = tag.name.lower()
            if tag_name in self._BLACKLISTED_TAGS:
                # blacklisted tags are removed in their entirety
                tag.extract()
            elif tag_name in self._WHITELISTED_TAGS:
                # tag is allowed. Make sure the attributes are allowed.
                for attr_name in list(tag.attrs):
                    value = tag.attrs[attr_name]
                    if self._attr_name_whitelisted(attr_name) and self._attr_value_allowed(attr_name, value):
                        tag.attrs[attr_name] = self.safe_css(attr_name, value)
                    else:
                        del tag.attrs[attr_name]
            else:
                tag.unwrap()

        # scripts can be executed from comments in some cases
        for comment in soup.find_all(text=lambda text: isinstance(text, Comment)):
            comment.extract()

        safe_html = str(soup)

        if safe_html == ", -":
            return None

        return safe_html.replace('</br>', '').replace('<br>', '<br/>')

    def _attr_name_whitelisted(self, attr_name):
        """Tell whether the attribute *attr_name* may be kept on a tag."""
        return attr_name.lower() in self._WHITELISTED_ATTRS

    def _attr_value_allowed(self, attr_name, value):
        """Tell whether *value* is acceptable for the attribute *attr_name*.

        Only ``href`` is checked: links using a script or data scheme are
        rejected so they cannot run code when clicked.
        """
        if attr_name.lower() != 'href':
            return True
        # drop whitespace and control characters first so that a scheme
        # obfuscated as e.g. "java\tscript:" is still recognised
        compact = re.sub(r'[\x00-\x20]+', '', str(value))
        return self._SCRIPT_SCHEME_RE.match(compact) is None

    def safe_css(self, attr, css):
        """Strip ``width`` and ``height`` declarations from a style value.

        :param attr: attribute name; only ``"style"`` values are changed
        :param css: attribute value
        :return: *css* without its width/height declarations for ``style``,
            *css* unchanged for any other attribute
        """
        if attr == "style":
            return self._DIMENSION_RE.sub("", css)
        return css