/dexy/filters/soup.py
https://gitlab.com/tlevine/dexy
from bs4 import BeautifulSoup
from dexy.filter import DexyFilter
from dexy.utils import chdir
import base64
import inflection
import mimetypes
import re
import urllib
class Customize(DexyFilter):
    """
    Add <script> tags or <link> tags to an HTML file's header.

    Uses BeautifulSoup.
    """
    aliases = ['customize']

    _settings = {
        'scripts' : ("Javascript files to add.", []),
        'stylesheets' : ("CSS files to add.", [])
    }

    def process_text(self, input_text):
        soup = BeautifulSoup(input_text)

        for js in self.setting('scripts'):
            js_tag = soup.new_tag("script", type="text/javascript", src=js)
            soup.head.append(js_tag)

        for css in self.setting('stylesheets'):
            css_tag = soup.new_tag("link", rel="stylesheet", type="text/css", href=css)
            soup.head.append(css_tag)

        return unicode(soup)
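
Customize never touches dexy's workspace; process_text is a plain string-to-string transform built from two BeautifulSoup operations, new_tag and head.append. A minimal standalone sketch of the same transform, outside dexy and with hypothetical script/stylesheet paths:

# Sketch of what Customize does to a page's <head>; needs only bs4.
from bs4 import BeautifulSoup

html = "<html><head><title>Demo</title></head><body></body></html>"
soup = BeautifulSoup(html, "html.parser")

for js in ["js/app.js"]:          # stand-in for the 'scripts' setting
    soup.head.append(soup.new_tag("script", type="text/javascript", src=js))

for css in ["css/site.css"]:      # stand-in for the 'stylesheets' setting
    soup.head.append(soup.new_tag("link", rel="stylesheet", type="text/css", href=css))

print(soup)  # the <head> now also contains the <script> and <link> tags

Note that soup.head.append assumes the input already has a <head> element (or a parser that adds one); otherwise soup.head is None and the append fails.
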
class InlineAssets(DexyFilter):
    """
    Inlines referenced images as data URIs and embeds referenced CSS
    in the page.
    """
    aliases = ['inliner']

    _settings = {
        'html-parser' : ("Name of html parser BeautifulSoup should use.", 'html.parser'),
        'inline-images' : ("Whether to inline images using the data uri scheme.", True),
        'inline-styles' : ("Whether to embed referenced CSS in the page header.", True)
    }

    def inline_images(self, soup):
        for tag in soup.find_all("img"):
            path = tag.get('src')
            f = urllib.urlopen(path)
            data = f.read()
            f.close()
            mime, _ = mimetypes.guess_type(path)
            data64 = base64.encodestring(data)
            dataURI = u'data:%s;base64,%s' % (mime, data64)
            tag['src'] = dataURI

    def inline_styles(self, soup):
        for tag in soup.find_all("link"):
            path = tag.get('href')
            f = urllib.urlopen(path)
            data = f.read()
            f.close()
            style = soup.new_tag('style')
            style.string = data
            tag.replace_with(style)

    def process(self):
        soup = BeautifulSoup(unicode(self.input_data), self.setting('html-parser'))
        self.populate_workspace()

        with chdir(self.parent_work_dir()):
            if self.setting('inline-images'):
                self.inline_images(soup)

            if self.setting('inline-styles'):
                self.inline_styles(soup)

        self.output_data.set_data(unicode(soup))
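
The interesting step in inline_images is the data-URI construction: guess the mimetype from the path, base64-encode the bytes, and join them as data:<mime>;base64,<payload>. A sketch of just that step, written with Python 3 names (b64encode in place of the Python 2 encodestring used above) and a hypothetical local file:

# Data-URI construction as in inline_images; 'logo.png' is a hypothetical file.
import base64
import mimetypes

path = "logo.png"
with open(path, "rb") as f:
    data = f.read()

mime, _ = mimetypes.guess_type(path)              # e.g. 'image/png'
payload = base64.b64encode(data).decode("ascii")
data_uri = "data:%s;base64,%s" % (mime, payload)
# the filter then rewrites the tag in place: tag['src'] = data_uri

Unlike encodestring, b64encode does not insert newlines into the payload.
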
class SoupSections(DexyFilter):
    """
    Split an HTML file into nested sections based on header tags.
    """
    aliases = ['soups']

    _settings = {
        'data-type' : 'sectioned',
        'html-parser' : ("Name of html parser BeautifulSoup should use.", 'html.parser'),
        'initial-section-name' : ("Name to use for the initial section which currently holds all the contents.", u"Actual Document Contents"),
    }

    def append_current_section(self):
        section_dict = {
            "name" : self.current_section_name,
            "contents" : self.current_section_text,
            "level" : self.current_section_level,
            "id" : self.current_section_anchor
        }
        self.output_data._data.append(section_dict)

    def process(self):
        soup = BeautifulSoup(unicode(self.input_data), self.setting('html-parser'))

        for tag in soup.find_all(re.compile("^h[0-6]")):
            name = tag.text
            m = re.match("^h([0-6])$", tag.name)

            if not tag.attrs.has_key('id'):
                tag.attrs['id'] = inflection.parameterize(name)

            self.current_section_anchor = tag.attrs['id']
            self.current_section_text = None
            self.current_section_name = name
            self.current_section_level = int(m.groups()[0])

            self.append_current_section()

        self.current_section_text = unicode(soup)
        self.current_section_name = self.setting('initial-section-name')
        self.current_section_level = 1
        self.current_section_anchor = None

        self.append_current_section()

        self.output_data.save()
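
The header scan in SoupSections.process reduces to three steps per tag: match each header tag (the h[0-6] pattern above), derive a numeric level from the tag name, and assign a URL-safe id via inflection.parameterize when the tag has none. A standalone sketch of that scan (without dexy's output_data bookkeeping), on a hypothetical two-heading document:

# Header scan as performed by SoupSections, printed instead of stored.
import re
from bs4 import BeautifulSoup
import inflection

html = "<html><body><h1>First Part</h1><p>text</p><h2 id='custom'>Sub Part</h2></body></html>"
soup = BeautifulSoup(html, "html.parser")

for tag in soup.find_all(re.compile("^h[0-6]")):
    level = int(re.match("^h([0-6])$", tag.name).group(1))
    anchor = tag.attrs.get('id') or inflection.parameterize(tag.text)
    print(level, anchor, tag.text)
# 1 first-part First Part
# 2 custom Sub Part

Only the final section produced by process carries any text: each per-heading section stores contents as None, and the whole id-annotated document is appended once at the end under the initial-section-name setting.
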