# Scraped page header (not part of the module):
# PageRenderTime 33ms CodeModel.GetById 11ms app.highlight 17ms RepoModel.GetById 1ms app.codeStats 1ms
#
# /feincms/utils/html/cleanse.py
#
# http://github.com/feincms/feincms
# Python | 158 lines | 105 code | 27 blank | 26 comment | 24 complexity | 032cd11f0b571b69d9c531e4b017241f MD5 | raw file
  1import lxml.html
  2import lxml.html.clean
  3import re
  4
  5
  6cleanse_html_allowed = {
  7    'a': ('href', 'name', 'target', 'title'),
  8    'h2': (),
  9    'h3': (),
 10    'strong': (),
 11    'em': (),
 12    'p': (),
 13    'ul': (),
 14    'ol': (),
 15    'li': (),
 16    'span': (),
 17    'br': (),
 18    'sub': (),
 19    'sup': (),
 20    'anything': (),
 21    }
 22
 23cleanse_html_allowed_empty_tags = ('br',)
 24
 25cleanse_html_merge = ('h2', 'h3', 'strong', 'em', 'ul', 'ol', 'sub', 'sup')
 26
 27
 28def cleanse_html(html):
 29    """
 30    Clean HTML code from ugly copy-pasted CSS and empty elements
 31
 32    Removes everything not explicitly allowed in ``cleanse_html_allowed``.
 33
 34    Requires ``lxml`` and ``beautifulsoup``.
 35    """
 36
 37    doc = lxml.html.fromstring('<anything>%s</anything>' % html)
 38    try:
 39        ignore = lxml.html.tostring(doc, encoding=unicode)
 40    except UnicodeDecodeError:
 41        # fall back to slower BeautifulSoup if parsing failed
 42        from lxml.html import soupparser
 43        doc = soupparser.fromstring(u'<anything>%s</anything>' % html)
 44
 45    cleaner = lxml.html.clean.Cleaner(
 46        allow_tags=cleanse_html_allowed.keys() + ['style'],
 47        remove_unknown_tags=False, # preserve surrounding 'anything' tag
 48        style=False, safe_attrs_only=False, # do not strip out style
 49                                            # attributes; we still need
 50                                            # the style information to
 51                                            # convert spans into em/strong
 52                                            # tags
 53        )
 54
 55    cleaner(doc)
 56
 57    # walk the tree recursively, because we want to be able to remove
 58    # previously emptied elements completely
 59    for element in reversed(list(doc.iterdescendants())):
 60        if element.tag == 'style':
 61            element.drop_tree()
 62            continue
 63
 64        # convert span elements into em/strong if a matching style rule
 65        # has been found. strong has precedence, strong & em at the same
 66        # time is not supported
 67        elif element.tag == 'span':
 68            style = element.attrib.get('style')
 69            if style:
 70                if 'bold' in style:
 71                    element.tag = 'strong'
 72                elif 'italic' in style:
 73                    element.tag = 'em'
 74
 75            if element.tag == 'span': # still span
 76                element.drop_tag() # remove tag, but preserve children and text
 77                continue
 78
 79        # remove empty tags if they are not <br />
 80        elif not element.text and element.tag not in \
 81                cleanse_html_allowed_empty_tags and not \
 82                len(list(element.iterdescendants())):
 83            element.drop_tag()
 84            continue
 85
 86        # remove all attributes which are not explicitly allowed
 87        allowed = cleanse_html_allowed.get(element.tag, [])
 88        for key in element.attrib.keys():
 89            if key not in allowed:
 90                del element.attrib[key]
 91
 92    # just to be sure, run cleaner again, but this time with even more
 93    # strict settings
 94    cleaner = lxml.html.clean.Cleaner(
 95        allow_tags=cleanse_html_allowed.keys(),
 96        remove_unknown_tags=False, # preserve surrounding 'anything' tag
 97        style=True, safe_attrs_only=True
 98        )
 99
100    cleaner(doc)
101
102    html = lxml.html.tostring(doc, method='xml')
103
104    # remove all sorts of newline characters
105    html = html.replace('\n', ' ').replace('\r', ' ')
106    html = html.replace('&#10;', ' ').replace('&#13;', ' ')
107    html = html.replace('&#xa;', ' ').replace('&#xd;', ' ')
108
109    # remove wrapping tag needed by XML parser
110    html = re.sub(r'</?anything>', '', html)
111
112    # remove elements containing only whitespace or linebreaks
113    whitespace_re = re.compile(r'<([a-z0-9]+)>(<br\s*/>|\&nbsp;|\&#160;|\s)*</\1>')
114    while True:
115        new = whitespace_re.sub('', html)
116        if new == html:
117            break
118        html = new
119
120    # merge tags
121    for tag in cleanse_html_merge:
122        merge_str = u'</%s><%s>'
123        while True:
124            new = html.replace(merge_str, u'')
125            if new == html:
126                break
127            html = new
128
129    # fix p-in-p tags
130    p_in_p_start_re = re.compile(r'<p>(\&nbsp;|\&#160;|\s)*<p>')
131    p_in_p_end_re = re.compile('</p>(\&nbsp;|\&#160;|\s)*</p>')
132
133    for tag in cleanse_html_merge:
134        merge_start_re = re.compile('<p>(\\&nbsp;|\\&#160;|\\s)*<%s>(\\&nbsp;|\\&#160;|\\s)*<p>' % tag)
135        merge_end_re = re.compile('</p>(\\&nbsp;|\\&#160;|\\s)*</%s>(\\&nbsp;|\\&#160;|\\s)*</p>' % tag)
136
137        while True:
138            new = merge_start_re.sub('<p>', html)
139            new = merge_end_re.sub('</p>', new)
140            new = p_in_p_start_re.sub('<p>', new)
141            new = p_in_p_end_re.sub('</p>', new)
142
143            if new == html:
144                break
145            html = new
146
147    # remove list markers with <li> tags before them
148    html = re.sub(r'<li>(\&nbsp;|\&#160;|\s)*(-|\*|&#183;)(\&nbsp;|\&#160;|\s)*', '<li>', html)
149
150    # remove p-in-li tags
151    html = re.sub(r'<li>(\&nbsp;|\&#160;|\s)*<p>', '<li>', html)
152    html = re.sub(r'</p>(\&nbsp;|\&#160;|\s)*</li>', '</li>', html)
153
154    # add a space before the closing slash in empty tags
155    html = re.sub(r'<([^/>]+)/>', r'<\1 />', html)
156
157    return html
158