
/textbookParsing/convert_notebooks_to_html_partial.py

https://gitlab.com/dibya/textbook-tools
  1. """
  2. This script takes the .ipynb files in the notebooks/ folder and removes the
  3. hidden cells as well as the newlines before closing </div> tags so that the
  4. resulting HTML partial can be embedded in a Gitbook page easily.
  5. For reference:
  6. https://nbconvert.readthedocs.org/en/latest/nbconvert_library.html
  7. http://nbconvert.readthedocs.org/en/latest/nbconvert_library.html#using-different-preprocessors
  8. """
  9. import glob
  10. import re
  11. import os
  12. import bs4
  13. import nbformat
  14. from nbconvert import HTMLExporter
  15. from traitlets.config import Config
  16. preamble = """
  17. <script type="text/x-mathjax-config">
  18. MathJax.Hub.Config({
  19. tex2jax: {
  20. inlineMath: [['$','$']],
  21. processEscapes: true
  22. }
  23. });
  24. </script>
  25. """
# Use ExtractOutputPreprocessor to extract the images to separate files
config = Config()
config.HTMLExporter.preprocessors = [
    'nbconvert.preprocessors.ExtractOutputPreprocessor',
]

# Output a HTML partial, not a complete page
html_exporter = HTMLExporter(config=config)
html_exporter.template_file = 'template.tpl'

# Output notebook HTML partials into this directory
NOTEBOOK_HTML_DIR = 'notebooks-html'

# Output notebook HTML images into this directory
NOTEBOOK_IMAGE_DIR = 'notebooks-images'

# The prefix for the interact button links. The path format string gets filled
# in with the notebook as well as any datasets the notebook requires.
INTERACT_LINK = 'http://data8.berkeley.edu/hub/interact?repo=textbook&{paths}'

# The prefix for each notebook + its dependencies
PATH_PREFIX = 'path=notebooks/{}'

# The regex used to find file dependencies for notebooks. I could have used
# triple quotes here but it messes up Python syntax highlighting :(
DATASET_REGEX = re.compile(
    r"read_table\("     # We look for a line containing read_table(
    r"('|\")"           # Then either a single or double quote
    r"(?P<dataset>"     # Start our named match -- dataset
    r"  (?!https?://)"  # Don't match http(s) since those aren't local files
    r"  \w+\.csv\w*"    # It has to have .csv in there (might end in .gz)
    r")"                # Finish our match
    r"\1\)",            # Make sure the quotes match
    re.VERBOSE)

# Used to ensure all the closing div tags are on the same line for Markdown to
# parse them properly
CLOSING_DIV_REGEX = re.compile(r'\s+</div>')
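
# A quick illustration of what DATASET_REGEX captures (the filename below is a
# hypothetical example, not one of the textbook's datasets):
#
#     >>> DATASET_REGEX.search("read_table('example.csv')").group('dataset')
#     'example.csv'
#
# URLs are deliberately skipped, so read_table('http://example.com/x.csv')
# produces no match and is not listed as a dependency.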


def convert_notebooks_to_html_partial(notebook_paths):
    """
    Converts notebooks in notebook_paths to HTML partials in NOTEBOOK_HTML_DIR
    """
    for notebook_path in notebook_paths:
        # Computes <name>.ipynb from notebooks/<name>.ipynb
        filename = notebook_path.split('/')[-1]
        # Computes <name> from <name>.ipynb
        basename = filename.split('.')[0]
        # Computes <name>.html from notebooks/<name>.ipynb
        outfile_name = basename + '.html'

        # This results in images like AB_5_1.png for a notebook called AB.ipynb
        unique_image_key = basename
        # This sets the img tag URL in the rendered HTML. This restricts the
        # chapter markdown files to be one level deep. It isn't ideal, but the
        # only way around it is to buy a domain for the staging textbook as
        # well and we'd rather not have to do that.
        output_files_dir = '../' + NOTEBOOK_IMAGE_DIR

        extract_output_config = {
            'unique_key': unique_image_key,
            'output_files_dir': output_files_dir,
        }

        notebook = nbformat.read(notebook_path, 4)
        raw_html, resources = html_exporter.from_notebook_node(
            notebook, resources=extract_output_config)
        html = preamble + _extract_cells(raw_html)

        # Get dependencies from notebook
        matches = list(DATASET_REGEX.finditer(
            '\n'.join([cell['source'] for cell in notebook.cells])
        ))
        dependencies = [match.group('dataset') for match in matches] + \
            [filename]
        paths = '&'.join([PATH_PREFIX.format(dep) for dep in dependencies])

        with_wrapper = """<div id="ipython-notebook">
            <a class="interact-button" href="{interact_link}">Interact</a>
            {html}
        </div>""".format(interact_link=INTERACT_LINK.format(paths=paths),
                         html=html)

        # Remove newlines before closing div tags
        final_output = CLOSING_DIV_REGEX.sub('</div>', with_wrapper)

        # Write out HTML
        outfile_path = os.path.join(os.curdir, NOTEBOOK_HTML_DIR, outfile_name)
        with open(outfile_path, 'w') as outfile:
            outfile.write(final_output)

        # Write out images
        for relative_path, image_data in resources['outputs'].items():
            image_name = relative_path.split('/')[-1]
            final_image_path = '{}/{}'.format(NOTEBOOK_IMAGE_DIR, image_name)
            with open(final_image_path, 'wb') as outimage:
                outimage.write(image_data)

        print(outfile_path + " written.")


def _extract_cells(html):
    """Return an HTML partial of divs with cell contents."""
    doc = bs4.BeautifulSoup(html, 'html5lib')

    def is_cell(classes):
        return classes and (
            'inner_cell' in classes or 'output_subarea' in classes)

    divs = doc.find_all('div', class_=is_cell)
    visible = [div for div in divs if '# HIDDEN' not in str(div)]

    def remove_empty_spans_and_prompts(tag):
        # Use explicit loops: map() is lazy in Python 3, so wrapping
        # decompose() in map() would never actually remove the tags.
        for prompt in tag.find_all('div', class_='prompt'):
            prompt.decompose()
        for span in tag.find_all('span', text='None'):
            span.decompose()

    for div in visible:
        remove_empty_spans_and_prompts(div)

    return '\n'.join(map(str, visible))


if __name__ == '__main__':
    notebook_paths = glob.glob('notebooks/*.ipynb')
    convert_notebooks_to_html_partial(notebook_paths)
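
# Usage sketch (an assumption about the expected layout, not stated above):
# run from the repository root so that notebooks/*.ipynb resolves, make sure
# the output directories exist (this script does not create them), and keep
# template.tpl somewhere on nbconvert's template search path, e.g.:
#
#     mkdir -p notebooks-html notebooks-images
#     python convert_notebooks_to_html_partial.py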