/dexy/filters/soup.py

https://gitlab.com/tlevine/dexy · Python

from bs4 import BeautifulSoup
from dexy.filter import DexyFilter
from dexy.utils import chdir
import base64
import inflection
import mimetypes
import re
import urllib

class Customize(DexyFilter):
    """
    Add <script> tags or <link> tags to an HTML file's header.
    Uses BeautifulSoup.
    """
    aliases = ['customize']
    _settings = {
            'scripts' : ("Javascript files to add.", []),
            'stylesheets' : ("CSS files to add.", [])
            }

    def process_text(self, input_text):
        soup = BeautifulSoup(input_text)

        for js in self.setting('scripts'):
            js_tag = soup.new_tag("script", type="text/javascript", src=js)
            soup.head.append(js_tag)

        for css in self.setting('stylesheets'):
            css_tag = soup.new_tag("link", rel="stylesheet", type="text/css", href=css)
            soup.head.append(css_tag)

        return unicode(soup)
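
# Illustrative sketch (not part of the original soup.py): the BeautifulSoup
# calls that Customize relies on, shown standalone. The helper name, the page
# snippet, and the asset paths below are hypothetical.
def _customize_sketch():
    page = "<html><head></head><body><p>hi</p></body></html>"
    soup = BeautifulSoup(page, "html.parser")
    soup.head.append(soup.new_tag("script", type="text/javascript", src="js/app.js"))
    soup.head.append(soup.new_tag("link", rel="stylesheet", type="text/css", href="css/site.css"))
    # The head now reads roughly:
    #   <script src="js/app.js" type="text/javascript"></script>
    #   <link href="css/site.css" rel="stylesheet" type="text/css"/>
    return str(soup)
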
class InlineAssets(DexyFilter):
    """
    Inlines referenced assets: images become data URIs and linked stylesheets
    are embedded in the page.
    """
    aliases = ['inliner']
    _settings = {
            'html-parser' : ("Name of html parser BeautifulSoup should use.", 'html.parser'),
            'inline-images' : ("Whether to inline images using the data uri scheme.", True),
            'inline-styles' : ("Whether to embed referenced CSS in the page header.", True)
            }

    def inline_images(self, soup):
        # Replace each <img> src with a base64-encoded data URI.
        for tag in soup.find_all("img"):
            path = tag.get('src')
            f = urllib.urlopen(path)
            data = f.read()
            f.close()
            mime, _ = mimetypes.guess_type(path)
            data64 = base64.encodestring(data)
            dataURI = u'data:%s;base64,%s' % (mime, data64)
            tag['src'] = dataURI

    def inline_styles(self, soup):
        # Replace each <link> with a <style> tag containing the stylesheet text.
        for tag in soup.find_all("link"):
            path = tag.get('href')
            f = urllib.urlopen(path)
            data = f.read()
            f.close()
            style = soup.new_tag('style')
            style.string = data
            tag.replace_with(style)

    def process(self):
        soup = BeautifulSoup(unicode(self.input_data), self.setting('html-parser'))

        self.populate_workspace()

        with chdir(self.parent_work_dir()):
            if self.setting('inline-images'):
                self.inline_images(soup)
            if self.setting('inline-styles'):
                self.inline_styles(soup)

        self.output_data.set_data(unicode(soup))
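
# Illustrative sketch (not part of the original soup.py): how a file on disk
# becomes a data URI with the same mimetypes + base64 approach used by
# inline_images above. The helper name and example path are hypothetical;
# base64.b64encode is used here so the encoded payload contains no newlines.
def _data_uri_sketch(path):
    with open(path, 'rb') as f:
        data = f.read()
    mime, _ = mimetypes.guess_type(path)   # e.g. ('image/png', None) for "logo.png"
    return u'data:%s;base64,%s' % (mime, base64.b64encode(data).decode('ascii'))
# _data_uri_sketch("logo.png")  ->  u'data:image/png;base64,iVBORw0...'
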
class SoupSections(DexyFilter):
    """
    Split an HTML file into nested sections based on header tags.
    """
    aliases = ['soups']
    _settings = {
            'data-type' : 'sectioned',
            'html-parser' : ("Name of html parser BeautifulSoup should use.", 'html.parser'),
            'initial-section-name' : ("Name to use for the initial section which currently holds all the contents.", u"Actual Document Contents"),
            }

    def append_current_section(self):
        section_dict = {
                "name" : self.current_section_name,
                "contents" : self.current_section_text,
                "level" : self.current_section_level,
                "id" : self.current_section_anchor
                }
        self.output_data._data.append(section_dict)

    def process(self):
        soup = BeautifulSoup(unicode(self.input_data), self.setting('html-parser'))

        # Record one section per header tag, assigning an id anchor to any
        # header that does not already have one.
        for tag in soup.find_all(re.compile("^h[0-6]")):
            name = tag.text
            m = re.match("^h([0-6])$", tag.name)

            if not tag.attrs.has_key('id'):
                tag.attrs['id'] = inflection.parameterize(name)

            self.current_section_anchor = tag.attrs['id']
            self.current_section_text = None
            self.current_section_name = name
            self.current_section_level = int(m.groups()[0])

            self.append_current_section()

        # Append a final section holding the full, id-annotated document.
        self.current_section_text = unicode(soup)
        self.current_section_name = self.setting('initial-section-name')
        self.current_section_level = 1
        self.current_section_anchor = None

        self.append_current_section()

        self.output_data.save()
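
# Illustrative sketch (not part of the original soup.py): the shape of the
# section list that SoupSections.process builds for a hypothetical two-header
# document. Header sections carry only metadata (contents is None); the final
# entry, named by the 'initial-section-name' setting, holds the full document
# including any generated id attributes.
_soups_example_sections = [
    {"name": u"Intro",   "level": 1, "id": "intro",   "contents": None},
    {"name": u"Details", "level": 2, "id": "details", "contents": None},
    {"name": u"Actual Document Contents", "level": 1, "id": None,
     "contents": u'<html>...<h1 id="intro">Intro</h1>...</html>'},
]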