
/textbookParsing/convert_notebooks_to_html_partial.py

https://gitlab.com/dibya/textbook-tools
  1. """
  2. This script takes the .ipynb files in the notebooks/ folder and removes the
  3. hidden cells as well as the newlines before closing </div> tags so that the
  4. resulting HTML partial can be embedded in a Gitbook page easily.
  5. For reference:
  6. https://nbconvert.readthedocs.org/en/latest/nbconvert_library.html
  7. http://nbconvert.readthedocs.org/en/latest/nbconvert_library.html#using-different-preprocessors
  8. """
  9. import glob
  10. import re
  11. import os
  12. import bs4
  13. import nbformat
  14. from nbconvert import HTMLExporter
  15. from traitlets.config import Config
  16. preamble = """
  17. <script type="text/x-mathjax-config">
  18. MathJax.Hub.Config({
  19. tex2jax: {
  20. inlineMath: [['$','$']],
  21. processEscapes: true
  22. }
  23. });
  24. </script>
  25. """
# Use ExtractOutputPreprocessor to extract the images to separate files
config = Config()
config.HTMLExporter.preprocessors = [
    'nbconvert.preprocessors.ExtractOutputPreprocessor',
]

# Output a HTML partial, not a complete page
html_exporter = HTMLExporter(config=config)
html_exporter.template_file = 'template.tpl'

# Output notebook HTML partials into this directory
NOTEBOOK_HTML_DIR = 'notebooks-html'

# Output notebook HTML images into this directory
NOTEBOOK_IMAGE_DIR = 'notebooks-images'

# The prefix for the interact button links. The path format string gets filled
# in with the notebook as well as any datasets the notebook requires.
INTERACT_LINK = 'http://data8.berkeley.edu/hub/interact?repo=textbook&{paths}'

# The prefix for each notebook + its dependencies
PATH_PREFIX = 'path=notebooks/{}'

# The regex used to find file dependencies for notebooks. I could have used
# triple quotes here but it messes up Python syntax highlighting :(
DATASET_REGEX = re.compile(
    r"read_table\("     # We look for a line containing read_table(
    r"('|\")"           # Then either a single or double quote
    r"(?P<dataset>"     # Start our named match -- dataset
    r"  (?!https?://)"  # Don't match http(s) since those aren't local files
    r"  \w+\.csv\w*"    # It has to have .csv in there (might end in .gz)
    r")"                # Finish our match
    r"\1\)",            # Make sure the quotes match
    re.VERBOSE)

# Used to ensure all the closing div tags are on the same line for Markdown to
# parse them properly
CLOSING_DIV_REGEX = re.compile(r'\s+</div>')
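
# A quick illustration of what DATASET_REGEX captures (the filename below is a
# hypothetical example, not one of the textbook's datasets):
#
#     >>> DATASET_REGEX.search("read_table('example.csv')").group('dataset')
#     'example.csv'
#
# URLs are deliberately skipped, so read_table('http://example.com/x.csv')
# produces no match and is not listed as a dependency.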


def convert_notebooks_to_html_partial(notebook_paths):
    """
    Converts notebooks in notebook_paths to HTML partials in NOTEBOOK_HTML_DIR
    """
    for notebook_path in notebook_paths:
        # Computes <name>.ipynb from notebooks/<name>.ipynb
        filename = notebook_path.split('/')[-1]
        # Computes <name> from <name>.ipynb
        basename = filename.split('.')[0]
        # Computes <name>.html from notebooks/<name>.ipynb
        outfile_name = basename + '.html'

        # This results in images like AB_5_1.png for a notebook called AB.ipynb
        unique_image_key = basename
        # This sets the img tag URL in the rendered HTML. This restricts the
        # chapter markdown files to be one level deep. It isn't ideal, but the
        # only way around it is to buy a domain for the staging textbook as
        # well and we'd rather not have to do that.
        output_files_dir = '../' + NOTEBOOK_IMAGE_DIR

        extract_output_config = {
            'unique_key': unique_image_key,
            'output_files_dir': output_files_dir,
        }

        notebook = nbformat.read(notebook_path, 4)
        raw_html, resources = html_exporter.from_notebook_node(
            notebook, resources=extract_output_config)
        html = preamble + _extract_cells(raw_html)

        # Get dependencies from notebook
        matches = list(DATASET_REGEX.finditer(
            '\n'.join([cell['source'] for cell in notebook.cells])
        ))
        dependencies = [match.group('dataset') for match in matches] + \
            [filename]
        paths = '&'.join([PATH_PREFIX.format(dep) for dep in dependencies])

        with_wrapper = """<div id="ipython-notebook">
            <a class="interact-button" href="{interact_link}">Interact</a>
            {html}
        </div>""".format(interact_link=INTERACT_LINK.format(paths=paths),
                         html=html)

        # Remove newlines before closing div tags
        final_output = CLOSING_DIV_REGEX.sub('</div>', with_wrapper)

        # Write out HTML
        outfile_path = os.path.join(os.curdir, NOTEBOOK_HTML_DIR, outfile_name)
        with open(outfile_path, 'w') as outfile:
            outfile.write(final_output)

        # Write out images
        for relative_path, image_data in resources['outputs'].items():
            image_name = relative_path.split('/')[-1]
            final_image_path = '{}/{}'.format(NOTEBOOK_IMAGE_DIR, image_name)
            with open(final_image_path, 'wb') as outimage:
                outimage.write(image_data)

        print(outfile_path + " written.")


def _extract_cells(html):
    """Return an HTML partial of divs with cell contents."""
    doc = bs4.BeautifulSoup(html, 'html5lib')

    def is_cell(classes):
        return classes and (
            'inner_cell' in classes or 'output_subarea' in classes)

    divs = doc.find_all('div', class_=is_cell)
    visible = [div for div in divs if '# HIDDEN' not in str(div)]

    def remove_empty_spans_and_prompts(tag):
        # Use explicit loops: map() is lazy in Python 3, so wrapping
        # decompose() in map() would never actually remove the tags.
        for prompt in tag.find_all('div', class_='prompt'):
            prompt.decompose()
        for span in tag.find_all('span', text='None'):
            span.decompose()

    for div in visible:
        remove_empty_spans_and_prompts(div)

    return '\n'.join(map(str, visible))


if __name__ == '__main__':
    notebook_paths = glob.glob('notebooks/*.ipynb')
    convert_notebooks_to_html_partial(notebook_paths)
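
# Usage sketch (an assumption about the expected layout, not stated above):
# run from the repository root so that notebooks/*.ipynb resolves, make sure
# the output directories exist (this script does not create them), and keep
# template.tpl somewhere on nbconvert's template search path, e.g.:
#
#     mkdir -p notebooks-html notebooks-images
#     python convert_notebooks_to_html_partial.py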