/feincms/utils/html/cleanse.py

http://github.com/feincms/feincms · Python · 158 lines · 99 code · 30 blank · 29 comment · 22 complexity · 032cd11f0b571b69d9c531e4b017241f MD5 · raw file

  1. import lxml.html
  2. import lxml.html.clean
  3. import re
  4. cleanse_html_allowed = {
  5. 'a': ('href', 'name', 'target', 'title'),
  6. 'h2': (),
  7. 'h3': (),
  8. 'strong': (),
  9. 'em': (),
  10. 'p': (),
  11. 'ul': (),
  12. 'ol': (),
  13. 'li': (),
  14. 'span': (),
  15. 'br': (),
  16. 'sub': (),
  17. 'sup': (),
  18. 'anything': (),
  19. }
  20. cleanse_html_allowed_empty_tags = ('br',)
  21. cleanse_html_merge = ('h2', 'h3', 'strong', 'em', 'ul', 'ol', 'sub', 'sup')
  22. def cleanse_html(html):
  23. """
  24. Clean HTML code from ugly copy-pasted CSS and empty elements
  25. Removes everything not explicitly allowed in ``cleanse_html_allowed``.
  26. Requires ``lxml`` and ``beautifulsoup``.
  27. """
  28. doc = lxml.html.fromstring('<anything>%s</anything>' % html)
  29. try:
  30. ignore = lxml.html.tostring(doc, encoding=unicode)
  31. except UnicodeDecodeError:
  32. # fall back to slower BeautifulSoup if parsing failed
  33. from lxml.html import soupparser
  34. doc = soupparser.fromstring(u'<anything>%s</anything>' % html)
  35. cleaner = lxml.html.clean.Cleaner(
  36. allow_tags=cleanse_html_allowed.keys() + ['style'],
  37. remove_unknown_tags=False, # preserve surrounding 'anything' tag
  38. style=False, safe_attrs_only=False, # do not strip out style
  39. # attributes; we still need
  40. # the style information to
  41. # convert spans into em/strong
  42. # tags
  43. )
  44. cleaner(doc)
  45. # walk the tree recursively, because we want to be able to remove
  46. # previously emptied elements completely
  47. for element in reversed(list(doc.iterdescendants())):
  48. if element.tag == 'style':
  49. element.drop_tree()
  50. continue
  51. # convert span elements into em/strong if a matching style rule
  52. # has been found. strong has precedence, strong & em at the same
  53. # time is not supported
  54. elif element.tag == 'span':
  55. style = element.attrib.get('style')
  56. if style:
  57. if 'bold' in style:
  58. element.tag = 'strong'
  59. elif 'italic' in style:
  60. element.tag = 'em'
  61. if element.tag == 'span': # still span
  62. element.drop_tag() # remove tag, but preserve children and text
  63. continue
  64. # remove empty tags if they are not <br />
  65. elif not element.text and element.tag not in \
  66. cleanse_html_allowed_empty_tags and not \
  67. len(list(element.iterdescendants())):
  68. element.drop_tag()
  69. continue
  70. # remove all attributes which are not explicitly allowed
  71. allowed = cleanse_html_allowed.get(element.tag, [])
  72. for key in element.attrib.keys():
  73. if key not in allowed:
  74. del element.attrib[key]
  75. # just to be sure, run cleaner again, but this time with even more
  76. # strict settings
  77. cleaner = lxml.html.clean.Cleaner(
  78. allow_tags=cleanse_html_allowed.keys(),
  79. remove_unknown_tags=False, # preserve surrounding 'anything' tag
  80. style=True, safe_attrs_only=True
  81. )
  82. cleaner(doc)
  83. html = lxml.html.tostring(doc, method='xml')
  84. # remove all sorts of newline characters
  85. html = html.replace('\n', ' ').replace('\r', ' ')
  86. html = html.replace('&#10;', ' ').replace('&#13;', ' ')
  87. html = html.replace('&#xa;', ' ').replace('&#xd;', ' ')
  88. # remove wrapping tag needed by XML parser
  89. html = re.sub(r'</?anything>', '', html)
  90. # remove elements containing only whitespace or linebreaks
  91. whitespace_re = re.compile(r'<([a-z0-9]+)>(<br\s*/>|\&nbsp;|\&#160;|\s)*</\1>')
  92. while True:
  93. new = whitespace_re.sub('', html)
  94. if new == html:
  95. break
  96. html = new
  97. # merge tags
  98. for tag in cleanse_html_merge:
  99. merge_str = u'</%s><%s>'
  100. while True:
  101. new = html.replace(merge_str, u'')
  102. if new == html:
  103. break
  104. html = new
  105. # fix p-in-p tags
  106. p_in_p_start_re = re.compile(r'<p>(\&nbsp;|\&#160;|\s)*<p>')
  107. p_in_p_end_re = re.compile('</p>(\&nbsp;|\&#160;|\s)*</p>')
  108. for tag in cleanse_html_merge:
  109. merge_start_re = re.compile('<p>(\\&nbsp;|\\&#160;|\\s)*<%s>(\\&nbsp;|\\&#160;|\\s)*<p>' % tag)
  110. merge_end_re = re.compile('</p>(\\&nbsp;|\\&#160;|\\s)*</%s>(\\&nbsp;|\\&#160;|\\s)*</p>' % tag)
  111. while True:
  112. new = merge_start_re.sub('<p>', html)
  113. new = merge_end_re.sub('</p>', new)
  114. new = p_in_p_start_re.sub('<p>', new)
  115. new = p_in_p_end_re.sub('</p>', new)
  116. if new == html:
  117. break
  118. html = new
  119. # remove list markers with <li> tags before them
  120. html = re.sub(r'<li>(\&nbsp;|\&#160;|\s)*(-|\*|&#183;)(\&nbsp;|\&#160;|\s)*', '<li>', html)
  121. # remove p-in-li tags
  122. html = re.sub(r'<li>(\&nbsp;|\&#160;|\s)*<p>', '<li>', html)
  123. html = re.sub(r'</p>(\&nbsp;|\&#160;|\s)*</li>', '</li>', html)
  124. # add a space before the closing slash in empty tags
  125. html = re.sub(r'<([^/>]+)/>', r'<\1 />', html)
  126. return html