PageRenderTime 64ms CodeModel.GetById 15ms app.highlight 43ms RepoModel.GetById 1ms app.codeStats 0ms

/lib/galaxy/util/sanitize_html.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 441 lines | 433 code | 4 blank | 4 comment | 5 complexity | d4ded7b5e467eff2b15d0a26066f5bac MD5 | raw file
  1"""
  2HTML Sanitizer (ripped from feedparser)
  3"""
  4
  5import re, sgmllib
  6
  7# reversible htmlentitydefs mappings for Python 2.2
  8try:
  9    from htmlentitydefs import name2codepoint, codepoint2name
 10except:
 11    import htmlentitydefs
 12    name2codepoint={}
 13    codepoint2name={}
 14    for (name,codepoint) in htmlentitydefs.entitydefs.iteritems():
 15      if codepoint.startswith('&#'): codepoint=unichr(int(codepoint[2:-1]))
 16      name2codepoint[name]=ord(codepoint)
 17      codepoint2name[ord(codepoint)]=name
 18
 19_cp1252 = {
 20  unichr(128): unichr(8364), # euro sign
 21  unichr(130): unichr(8218), # single low-9 quotation mark
 22  unichr(131): unichr( 402), # latin small letter f with hook
 23  unichr(132): unichr(8222), # double low-9 quotation mark
 24  unichr(133): unichr(8230), # horizontal ellipsis
 25  unichr(134): unichr(8224), # dagger
 26  unichr(135): unichr(8225), # double dagger
 27  unichr(136): unichr( 710), # modifier letter circumflex accent
 28  unichr(137): unichr(8240), # per mille sign
 29  unichr(138): unichr( 352), # latin capital letter s with caron
 30  unichr(139): unichr(8249), # single left-pointing angle quotation mark
 31  unichr(140): unichr( 338), # latin capital ligature oe
 32  unichr(142): unichr( 381), # latin capital letter z with caron
 33  unichr(145): unichr(8216), # left single quotation mark
 34  unichr(146): unichr(8217), # right single quotation mark
 35  unichr(147): unichr(8220), # left double quotation mark
 36  unichr(148): unichr(8221), # right double quotation mark
 37  unichr(149): unichr(8226), # bullet
 38  unichr(150): unichr(8211), # en dash
 39  unichr(151): unichr(8212), # em dash
 40  unichr(152): unichr( 732), # small tilde
 41  unichr(153): unichr(8482), # trade mark sign
 42  unichr(154): unichr( 353), # latin small letter s with caron
 43  unichr(155): unichr(8250), # single right-pointing angle quotation mark
 44  unichr(156): unichr( 339), # latin small ligature oe
 45  unichr(158): unichr( 382), # latin small letter z with caron
 46  unichr(159): unichr( 376)} # latin capital letter y with diaeresis
 47
 48class _BaseHTMLProcessor(sgmllib.SGMLParser):
 49    special = re.compile('''[<>'"]''')
 50    bare_ampersand = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
 51    elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
 52      'img', 'input', 'isindex', 'link', 'meta', 'param']
 53
 54    def __init__(self, encoding, type):
 55        self.encoding = encoding
 56        self.type = type
 57        ## if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)
 58        sgmllib.SGMLParser.__init__(self)
 59
 60    def reset(self):
 61        self.pieces = []
 62        sgmllib.SGMLParser.reset(self)
 63
 64    def _shorttag_replace(self, match):
 65        tag = match.group(1)
 66        if tag in self.elements_no_end_tag:
 67            return '<' + tag + ' />'
 68        else:
 69            return '<' + tag + '></' + tag + '>'
 70
 71    def parse_starttag(self,i):
 72        j=sgmllib.SGMLParser.parse_starttag(self, i)
 73        if self.type == 'application/xhtml+xml':
 74            if j>2 and self.rawdata[j-2:j]=='/>':
 75                self.unknown_endtag(self.lasttag)
 76        return j
 77
 78    def feed(self, data):
 79        data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
 80        #data = re.sub(r'<(\S+?)\s*?/>', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace
 81        data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data)
 82        data = data.replace('&#39;', "'")
 83        data = data.replace('&#34;', '"')
 84        if self.encoding and type(data) == type(u''):
 85            data = data.encode(self.encoding)
 86        sgmllib.SGMLParser.feed(self, data)
 87        sgmllib.SGMLParser.close(self)
 88
 89    def normalize_attrs(self, attrs):
 90        if not attrs: return attrs
 91        # utility method to be called by descendants
 92        attrs = dict([(k.lower(), v) for k, v in attrs]).items()
 93        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
 94        attrs.sort()
 95        return attrs
 96
 97    def unknown_starttag(self, tag, attrs):
 98        # called for each start tag
 99        # attrs is a list of (attr, value) tuples
100        # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
101        ## if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
102        uattrs = []
103        strattrs=''
104        if attrs:
105            for key, value in attrs:
106                value=value.replace('>','&gt;').replace('<','&lt;').replace('"','&quot;')
107                value = self.bare_ampersand.sub("&amp;", value)
108                # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
109                if type(value) != type(u''):
110                    try:
111                        value = unicode(value, self.encoding)
112                    except:
113                        value = unicode(value, 'iso-8859-1')
114                uattrs.append((unicode(key, self.encoding), value))
115            strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs])
116            if self.encoding:
117                try:
118                    strattrs=strattrs.encode(self.encoding)
119                except:
120                    pass
121        if tag in self.elements_no_end_tag:
122            self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
123        else:
124            self.pieces.append('<%(tag)s%(strattrs)s>' % locals())
125
126    def unknown_endtag(self, tag):
127        # called for each end tag, e.g. for </pre>, tag will be 'pre'
128        # Reconstruct the original end tag.
129        if tag not in self.elements_no_end_tag:
130            self.pieces.append("</%(tag)s>" % locals())
131
132    def handle_charref(self, ref):
133        # called for each character reference, e.g. for '&#160;', ref will be '160'
134        # Reconstruct the original character reference.
135        if ref.startswith('x'):
136            value = unichr(int(ref[1:],16))
137        else:
138            value = unichr(int(ref))
139
140        if value in _cp1252.keys():
141            self.pieces.append('&#%s;' % hex(ord(_cp1252[value]))[1:])
142        else:
143            self.pieces.append('&#%(ref)s;' % locals())
144
145    def handle_entityref(self, ref):
146        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
147        # Reconstruct the original entity reference.
148        if name2codepoint.has_key(ref):
149            self.pieces.append('&%(ref)s;' % locals())
150        else:
151            self.pieces.append('&amp;%(ref)s' % locals())
152
153    def handle_data(self, text):
154        # called for each block of plain text, i.e. outside of any tag and
155        # not containing any character or entity references
156        # Store the original text verbatim.
157        ## if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)
158        self.pieces.append(text)
159
160    def handle_comment(self, text):
161        # called for each HTML comment, e.g. <!-- insert Javascript code here -->
162        # Reconstruct the original comment.
163        self.pieces.append('<!--%(text)s-->' % locals())
164
165    def handle_pi(self, text):
166        # called for each processing instruction, e.g. <?instruction>
167        # Reconstruct original processing instruction.
168        self.pieces.append('<?%(text)s>' % locals())
169
170    def handle_decl(self, text):
171        # called for the DOCTYPE, if present, e.g.
172        # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
173        #     "http://www.w3.org/TR/html4/loose.dtd">
174        # Reconstruct original DOCTYPE
175        self.pieces.append('<!%(text)s>' % locals())
176
177    _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
178    def _scan_name(self, i, declstartpos):
179        rawdata = self.rawdata
180        n = len(rawdata)
181        if i == n:
182            return None, -1
183        m = self._new_declname_match(rawdata, i)
184        if m:
185            s = m.group()
186            name = s.strip()
187            if (i + len(s)) == n:
188                return None, -1  # end of buffer
189            return name.lower(), m.end()
190        else:
191            self.handle_data(rawdata)
192#            self.updatepos(declstartpos, i)
193            return None, -1
194
195    def convert_charref(self, name):
196        return '&#%s;' % name
197
198    def convert_entityref(self, name):
199        return '&%s;' % name
200
201    def output(self):
202        '''Return processed HTML as a single string'''
203        return ''.join([str(p) for p in self.pieces])
204
class _HTMLSanitizer(_BaseHTMLProcessor):
    """Whitelist filter layered on _BaseHTMLProcessor.

    Start and end tags are emitted only for elements in acceptable_elements
    or, inside a correctly namespaced <math> / <svg>, for elements in the
    MathML / SVG lists.  Attributes are kept only when whitelisted for that
    vocabulary, 'style' attributes are always dropped, and URI attributes
    carrying a script scheme are dropped.  Text inside the elements named in
    unacceptable_elements_with_end_tag is suppressed; processing instructions
    and declarations are discarded.  Comments are not overridden here and so
    are emitted by the base class.
    """

    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'article',
      'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button', 'canvas',
      'caption', 'center', 'cite', 'code', 'col', 'colgroup', 'command',
      'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'dir',
      'div', 'dl', 'dt', 'em', 'event-source', 'fieldset', 'figure', 'footer',
      'font', 'form', 'header', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i',
      'img', 'input', 'ins', 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map',
      'menu', 'meter', 'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup',
      'option', 'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
      'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong', 'sub',
      'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot', 'th', 'thead',
      'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript']

    acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
      'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
      'background', 'balance', 'bgcolor', 'bgproperties', 'border',
      'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
      'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
      'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color', 'cols',
      'colspan', 'compact', 'contenteditable', 'controls', 'coords', 'data',
      'datafld', 'datapagesize', 'datasrc', 'datetime', 'default', 'delay',
      'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end', 'face', 'for',
      'form', 'frame', 'galleryimg', 'gutter', 'headers', 'height', 'hidefocus',
      'hidden', 'high', 'href', 'hreflang', 'hspace', 'icon', 'id', 'inputmode',
      'ismap', 'keytype', 'label', 'leftspacing', 'lang', 'list', 'longdesc',
      'loop', 'loopcount', 'loopend', 'loopstart', 'low', 'lowsrc', 'max',
      'maxlength', 'media', 'method', 'min', 'multiple', 'name', 'nohref',
      'noshade', 'nowrap', 'open', 'optimum', 'pattern', 'ping', 'point-size',
      'prompt', 'pqg', 'radiogroup', 'readonly', 'rel', 'repeat-max',
      'repeat-min', 'replace', 'required', 'rev', 'rightspacing', 'rows',
      'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src',
      'start', 'step', 'summary', 'suppress', 'tabindex', 'target', 'template',
      'title', 'toppadding', 'type', 'unselectable', 'usemap', 'urn', 'valign',
      'value', 'variable', 'volume', 'vspace', 'vrml', 'width', 'wrap',
      'xml:lang']

    # Elements whose tags AND text content are removed.
    unacceptable_elements_with_end_tag = ['script', 'applet', 'style']

    acceptable_css_properties = ['azimuth', 'background-color',
      'border-bottom-color', 'border-collapse', 'border-color',
      'border-left-color', 'border-right-color', 'border-top-color', 'clear',
      'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
      'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
      'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
      'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
      'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
      'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
      'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
      'white-space', 'width']

    # survey of common keywords found in feeds
    acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
      'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
      'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
      'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
      'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
      'transparent', 'underline', 'white', 'yellow']

    valid_css_values = re.compile(r'^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|' +
      r'\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$')

    mathml_elements = ['annotation', 'annotation-xml', 'maction', 'math',
      'merror', 'mfenced', 'mfrac', 'mi', 'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded',
      'mphantom', 'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle',
      'msub', 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
      'munderover', 'none', 'semantics']

    mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
      'columnalign', 'close', 'columnlines', 'columnspacing', 'columnspan', 'depth',
      'display', 'displaystyle', 'encoding', 'equalcolumns', 'equalrows',
      'fence', 'fontstyle', 'fontweight', 'frame', 'height', 'linethickness',
      'lspace', 'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant',
      'maxsize', 'minsize', 'open', 'other', 'rowalign', 'rowalign', 'rowalign',
      'rowlines', 'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
      'separator', 'separators', 'stretchy', 'width', 'width', 'xlink:href',
      'xlink:show', 'xlink:type', 'xmlns', 'xmlns:xlink']

    # svgtiny - foreignObject + linearGradient + radialGradient + stop
    svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
      'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'foreignObject',
      'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
      'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', 'mpath',
      'path', 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop',
      'svg', 'switch', 'text', 'title', 'tspan', 'use']

    # svgtiny + class + opacity + offset + xmlns + xmlns:xlink
    svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
       'arabic-form', 'ascent', 'attributeName', 'attributeType',
       'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
       'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
       'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity',
       'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style',
       'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2',
       'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x',
       'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints', 'keySplines',
       'keyTimes', 'lang', 'mathematical', 'marker-end', 'marker-mid',
       'marker-start', 'markerHeight', 'markerUnits', 'markerWidth', 'max',
       'min', 'name', 'offset', 'opacity', 'orient', 'origin',
       'overline-position', 'overline-thickness', 'panose-1', 'path',
       'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX', 'refY',
       'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures',
       'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv',
       'stop-color', 'stop-opacity', 'strikethrough-position',
       'strikethrough-thickness', 'stroke', 'stroke-dasharray',
       'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin',
       'stroke-miterlimit', 'stroke-opacity', 'stroke-width', 'systemLanguage',
       'target', 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
       'underline-position', 'underline-thickness', 'unicode', 'unicode-range',
       'units-per-em', 'values', 'version', 'viewBox', 'visibility', 'width',
       'widths', 'x', 'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
       'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
       'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1',
       'y2', 'zoomAndPan']

    # Lower-case -> camelCase lookup tables, built on first SVG element.
    svg_attr_map = None
    svg_elem_map = None

    acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule',
      'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
      'stroke-opacity']

    # Attributes whose value is a URI, checked for script schemes.
    uri_attributes = ['action', 'background', 'cite', 'data', 'dynsrc', 'href',
      'icon', 'longdesc', 'lowsrc', 'ping', 'src', 'usemap', 'xlink:href']

    # Scheme prefixes that make a URI attribute unsafe.  Matched as a bare
    # prefix (no ':'), because the colon itself may arrive entity-encoded.
    unsafe_uri_schemes = ('javascript', 'vbscript')

    # Numeric character reference, with or without the closing ';'.
    _numeric_charref = re.compile(r'&#(?:[xX]([0-9a-fA-F]+)|([0-9]+));?')
    # Whitespace / control characters (and the named references for tab and
    # newline) that can be interleaved with the letters of a scheme.
    _uri_noise = re.compile(r'[\x00-\x20\x7f]+|&(?:Tab|NewLine);')

    def reset(self):
        """Reset parser state and the suppression / namespace counters."""
        _BaseHTMLProcessor.reset(self)
        self.unacceptablestack = 0
        self.mathmlOK = 0
        self.svgOK = 0

    def _decode_ascii_charref(self, match):
        """Replace a numeric reference to an ASCII character by that character."""
        hex_digits, dec_digits = match.groups()
        if hex_digits:
            code = int(hex_digits, 16)
        else:
            code = int(dec_digits)
        if code < 128:
            return chr(code)
        # Non-ASCII characters cannot be part of a scheme name; leave as is.
        return match.group(0)

    def _is_script_uri(self, value):
        """Return True when *value* starts with one of unsafe_uri_schemes.

        The comparison is made on a copy with ASCII numeric character
        references decoded, whitespace/control characters removed and letters
        lower-cased, so 'JaVaScRiPt:', ' java\\tscript:' and
        '&#106;avascript:' are all recognized.
        """
        text = self._numeric_charref.sub(self._decode_ascii_charref, value)
        text = self._uri_noise.sub('', text).lower()
        for scheme in self.unsafe_uri_schemes:
            if text.startswith(scheme):
                return True
        return False

    def unknown_starttag(self, tag, attrs):
        """Emit *tag* with only its whitelisted attributes, or drop it.

        Note that *attrs* (the list handed in by the parser) may get an
        'xmlns:xlink' declaration appended in place.
        """
        acceptable_attributes = self.acceptable_attributes
        keymap = {}
        if not tag in self.acceptable_elements or self.svgOK:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack += 1

            # not otherwise acceptable, perhaps it is MathML or SVG?
            if tag == 'math' and ('xmlns', 'http://www.w3.org/1998/Math/MathML') in attrs:
                self.mathmlOK += 1
            if tag == 'svg' and ('xmlns', 'http://www.w3.org/2000/svg') in attrs:
                self.svgOK += 1

            # chose acceptable attributes based on tag class, else bail
            if self.mathmlOK and tag in self.mathml_elements:
                acceptable_attributes = self.mathml_attributes
            elif self.svgOK and tag in self.svg_elements:
                # for most vocabularies, lowercasing is a good idea.  Many
                # svg elements, however, are camel case
                if self.svg_attr_map is None:
                    lower = [attr.lower() for attr in self.svg_attributes]
                    mix = [a for a in self.svg_attributes if a not in lower]
                    self.svg_attributes = lower
                    self.svg_attr_map = dict([(a.lower(), a) for a in mix])

                    lower = [elem.lower() for elem in self.svg_elements]
                    mix = [e for e in self.svg_elements if e not in lower]
                    self.svg_elements = lower
                    self.svg_elem_map = dict([(e.lower(), e) for e in mix])
                acceptable_attributes = self.svg_attributes
                tag = self.svg_elem_map.get(tag, tag)
                keymap = self.svg_attr_map
            elif not tag in self.acceptable_elements:
                return

        # declare xlink namespace, if needed
        if self.mathmlOK or self.svgOK:
            xlink_ns = ('xmlns:xlink', 'http://www.w3.org/1999/xlink')
            uses_xlink = [name for name, _value in attrs
                          if name.startswith('xlink:')]
            if uses_xlink and xlink_ns not in attrs:
                attrs.append(xlink_ns)

        clean_attrs = []
        for key, value in self.normalize_attrs(attrs):
            if key in self.uri_attributes and self._is_script_uri(value):
                continue
            if key in acceptable_attributes:
                clean_attrs.append((keymap.get(key, key), value))
            # Everything else is dropped.  That includes 'style': it is not
            # whitelisted and sanitize_style() is deliberately not applied.
        _BaseHTMLProcessor.unknown_starttag(self, tag, clean_attrs)

    def unknown_endtag(self, tag):
        """Emit the end tag for *tag* when its element is acceptable."""
        if not tag in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                # Never go below zero: a stray closing tag would otherwise
                # leave a non-zero count (suppressing all following text) and
                # let a later opening tag bring the count back to zero, which
                # would expose that element's content.
                if self.unacceptablestack > 0:
                    self.unacceptablestack -= 1
            if self.mathmlOK and tag in self.mathml_elements:
                if tag == 'math' and self.mathmlOK:
                    self.mathmlOK -= 1
            elif self.svgOK and tag in self.svg_elements:
                tag = self.svg_elem_map.get(tag, tag)
                if tag == 'svg' and self.svgOK:
                    self.svgOK -= 1
            else:
                return
        _BaseHTMLProcessor.unknown_endtag(self, tag)

    def handle_pi(self, text):
        """Discard processing instructions."""
        pass

    def handle_decl(self, text):
        """Discard declarations."""
        pass

    def handle_data(self, text):
        """Emit text unless it sits inside a suppressed element."""
        if not self.unacceptablestack:
            _BaseHTMLProcessor.handle_data(self, text)

    def sanitize_style(self, style):
        """Return *style* reduced to whitelisted CSS declarations.

        Returns '' when the value contains characters or structure outside
        the accepted grammar.  Not called by unknown_starttag() in this file.
        """
        # disallow urls
        style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)

        # gauntlet
        if not re.match(r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
            return ''
        if not re.match(r"^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$", style):
            return ''

        clean = []
        for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
            if not value:
                continue
            if prop.lower() in self.acceptable_css_properties:
                clean.append(prop + ': ' + value + ';')
            elif prop.split('-')[0].lower() in ['background', 'border', 'margin', 'padding']:
                # Shorthand families: accept only if every keyword is known
                # or matches the numeric/colour value pattern.
                for keyword in value.split():
                    if not keyword in self.acceptable_css_keywords and \
                        not self.valid_css_values.match(keyword):
                        break
                else:
                    clean.append(prop + ': ' + value + ';')
            elif self.svgOK and prop.lower() in self.acceptable_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)
434
435
def sanitize_html(htmlSource, encoding="utf-8", type="text/html"):
    """Run *htmlSource* through _HTMLSanitizer and return the result.

    encoding and type are handed to the sanitizer's constructor.  The output
    is stripped of leading/trailing whitespace and has every '\\r\\n' replaced
    by '\\n'.
    """
    sanitizer = _HTMLSanitizer(encoding, type)
    sanitizer.feed(htmlSource)
    cleaned = sanitizer.output().strip()
    return cleaned.replace('\r\n', '\n')