/tornado/escape.py
#!/usr/bin/env python
#
# Copyright 2009 Facebook
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

"""Escaping/unescaping methods for HTML, JSON, URLs, and others.

Also includes a few other miscellaneous string manipulation functions that
have crept in over time.
"""
import htmlentitydefs
import re
import sys
import urllib

# Python3 compatibility: On python2.5, introduce the bytes alias from 2.6
try: bytes
except Exception: bytes = str

try:
    from urlparse import parse_qs  # Python 2.6+
except ImportError:
    from cgi import parse_qs

# json module is in the standard library as of python 2.6; fall back to
# simplejson if present for older versions.
try:
    import json
    assert hasattr(json, "loads") and hasattr(json, "dumps")
    _json_decode = json.loads
    _json_encode = json.dumps
except Exception:
    try:
        import simplejson
        _json_decode = lambda s: simplejson.loads(_unicode(s))
        _json_encode = lambda v: simplejson.dumps(v)
    except ImportError:
        try:
            # For Google AppEngine
            from django.utils import simplejson
            _json_decode = lambda s: simplejson.loads(_unicode(s))
            _json_encode = lambda v: simplejson.dumps(v)
        except ImportError:
            def _json_decode(s):
                raise NotImplementedError(
                    "A JSON parser is required, e.g., simplejson at "
                    "http://pypi.python.org/pypi/simplejson/")
            _json_encode = _json_decode
_XHTML_ESCAPE_RE = re.compile('[&<>"]')
_XHTML_ESCAPE_DICT = {'&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;'}

def xhtml_escape(value):
    """Escapes a string so it is valid within XML or XHTML."""
    return _XHTML_ESCAPE_RE.sub(lambda match: _XHTML_ESCAPE_DICT[match.group(0)],
                                to_basestring(value))

def xhtml_unescape(value):
    """Un-escapes an XML-escaped string."""
    return re.sub(r"&(#?)(\w+?);", _convert_entity, _unicode(value))
def json_encode(value):
    """JSON-encodes the given Python object."""
    # JSON permits but does not require forward slashes to be escaped.
    # This is useful when json data is emitted in a <script> tag
    # in HTML, as it prevents </script> tags from prematurely terminating
    # the javascript. Some json libraries do this escaping by default,
    # although python's standard library does not, so we do it here.
    # http://stackoverflow.com/questions/1580647/json-why-are-forward-slashes-escaped
    return _json_encode(recursive_unicode(value)).replace("</", "<\\/")

def json_decode(value):
    """Returns Python objects for the given JSON string."""
    return _json_decode(to_basestring(value))
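# Illustrative usage of json_encode / json_decode (not part of the original
# module): any "</" inside an encoded string is emitted as "<\/", so the
# result can be embedded safely in a <script> block, and json_decode still
# round-trips it.
#
#     json_encode({"tag": "</script>"})      -> '{"tag": "<\\/script>"}'
#     json_decode('{"tag": "<\\/script>"}')  -> {u'tag': u'</script>'}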
def squeeze(value):
    """Replace all sequences of whitespace chars with a single space."""
    return re.sub(r"[\x00-\x20]+", " ", value).strip()
def url_escape(value):
    """Returns a valid URL-encoded version of the given value."""
    return urllib.quote_plus(utf8(value))
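# Illustrative usage of url_escape (not part of the original module): the
# value is UTF-8 encoded first, then percent-escaped with spaces as '+'.
#
#     url_escape(u"caf\u00e9 & ole")  -> 'caf%C3%A9+%26+ole'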
# python 3 changed things around enough that we need two separate
# implementations of url_unescape. We also need our own implementation
# of parse_qs since python 3's version insists on decoding everything.
if sys.version_info[0] < 3:
    def url_unescape(value, encoding='utf-8'):
        """Decodes the given value from a URL.

        The argument may be either a byte or unicode string.

        If encoding is None, the result will be a byte string. Otherwise,
        the result is a unicode string in the specified encoding.
        """
        if encoding is None:
            return urllib.unquote_plus(utf8(value))
        else:
            return unicode(urllib.unquote_plus(utf8(value)), encoding)

    parse_qs_bytes = parse_qs
else:
    def url_unescape(value, encoding='utf-8'):
        """Decodes the given value from a URL.

        The argument may be either a byte or unicode string.

        If encoding is None, the result will be a byte string. Otherwise,
        the result is a unicode string in the specified encoding.
        """
        if encoding is None:
            return urllib.parse.unquote_to_bytes(value)
        else:
            return urllib.parse.unquote_plus(to_basestring(value),
                                             encoding=encoding)

    def parse_qs_bytes(qs, keep_blank_values=False, strict_parsing=False):
        """Parses a query string like urlparse.parse_qs, but returns the
        values as byte strings.

        Keys still become type str (interpreted as latin1 in python3!)
        because it's too painful to keep them as byte strings in
        python3 and in practice they're nearly always ascii anyway.
        """
        # This is gross, but python3 doesn't give us another way.
        # Latin1 is the universal donor of character encodings.
        result = parse_qs(qs, keep_blank_values, strict_parsing,
                          encoding='latin1', errors='strict')
        encoded = {}
        for k, v in result.items():
            encoded[k] = [i.encode('latin1') for i in v]
        return encoded
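# Illustrative usage of url_unescape / parse_qs_bytes (not part of the
# original module):
#
#     url_unescape('caf%C3%A9+ole')                 -> u'caf\xe9 ole'
#     url_unescape('caf%C3%A9+ole', encoding=None)  -> 'caf\xc3\xa9 ole'
#     parse_qs_bytes('x=%C3%A9&y=2')  -> values arrive as byte strings,
#                                        e.g. {'x': ['\xc3\xa9'], 'y': ['2']}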
_UTF8_TYPES = (bytes, type(None))

def utf8(value):
    """Converts a string argument to a byte string.

    If the argument is already a byte string or None, it is returned unchanged.
    Otherwise it must be a unicode string and is encoded as utf8.
    """
    if isinstance(value, _UTF8_TYPES):
        return value
    assert isinstance(value, unicode)
    return value.encode("utf-8")
_TO_UNICODE_TYPES = (unicode, type(None))

def to_unicode(value):
    """Converts a string argument to a unicode string.

    If the argument is already a unicode string or None, it is returned
    unchanged. Otherwise it must be a byte string and is decoded as utf8.
    """
    if isinstance(value, _TO_UNICODE_TYPES):
        return value
    assert isinstance(value, bytes)
    return value.decode("utf-8")

# to_unicode was previously named _unicode not because it was private,
# but to avoid conflicts with the built-in unicode() function/type
_unicode = to_unicode
# When dealing with the standard library across python 2 and 3 it is
# sometimes useful to have a direct conversion to the native string type
if str is unicode:
    native_str = to_unicode
else:
    native_str = utf8

_BASESTRING_TYPES = (basestring, type(None))

def to_basestring(value):
    """Converts a string argument to a subclass of basestring.

    In python2, byte and unicode strings are mostly interchangeable,
    so functions that deal with a user-supplied argument in combination
    with ascii string constants can use either and should return the type
    the user supplied. In python3, the two types are not interchangeable,
    so this method is needed to convert byte strings to unicode.
    """
    if isinstance(value, _BASESTRING_TYPES):
        return value
    assert isinstance(value, bytes)
    return value.decode("utf-8")
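# Illustrative usage of utf8 / to_unicode / to_basestring (not part of the
# original module; Python 2 semantics):
#
#     utf8(u'caf\xe9')           -> 'caf\xc3\xa9'   (byte string)
#     to_unicode('caf\xc3\xa9')  -> u'caf\xe9'
#     to_basestring('abc')       -> 'abc'           (already a basestring)
#     utf8(None), to_unicode(None)  -> None         (passed through)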
def recursive_unicode(obj):
    """Walks a simple data structure, converting byte strings to unicode.

    Supports lists, tuples, and dictionaries.
    """
    if isinstance(obj, dict):
        return dict((recursive_unicode(k), recursive_unicode(v)) for (k, v) in obj.iteritems())
    elif isinstance(obj, list):
        return list(recursive_unicode(i) for i in obj)
    elif isinstance(obj, tuple):
        return tuple(recursive_unicode(i) for i in obj)
    elif isinstance(obj, bytes):
        return to_unicode(obj)
    else:
        return obj
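# Illustrative usage of recursive_unicode (not part of the original module;
# Python 2 semantics, where plain str literals are byte strings):
#
#     recursive_unicode({'k': 'v', 'seq': ('a', 1)})
#         -> {u'k': u'v', u'seq': (u'a', 1)}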
# I originally used the regex from
# http://daringfireball.net/2010/07/improved_regex_for_matching_urls
# but it gets all exponential on certain patterns (such as too many trailing
# dots), causing the regex matcher to never return.
# This regex should avoid those problems.
_URL_RE = re.compile(ur"""\b((?:([\w-]+):(/{1,3})|www[.])(?:(?:(?:[^\s&()]|&amp;|&quot;)*(?:[^!"#$%&'()*+,.:;<=>?@\[\]^`{|}~\s]))|(?:\((?:[^\s&()]|&amp;|&quot;)*\)))+)""")
def linkify(text, shorten=False, extra_params="",
            require_protocol=False, permitted_protocols=["http", "https"]):
    """Converts plain text into HTML with links.

    For example: ``linkify("Hello http://tornadoweb.org!")`` would return
    ``Hello <a href="http://tornadoweb.org">http://tornadoweb.org</a>!``

    Parameters:

    shorten: Long urls will be shortened for display.

    extra_params: Extra text to include in the link tag,
        e.g. linkify(text, extra_params='rel="nofollow" class="external"')

    require_protocol: Only linkify urls which include a protocol. If this is
        False, urls such as www.facebook.com will also be linkified.

    permitted_protocols: List (or set) of protocols which should be linkified,
        e.g. linkify(text, permitted_protocols=["http", "ftp", "mailto"]).
        It is very unsafe to include protocols such as "javascript".
    """
    if extra_params:
        extra_params = " " + extra_params.strip()

    def make_link(m):
        url = m.group(1)
        proto = m.group(2)
        if require_protocol and not proto:
            return url  # no protocol, no linkify

        if proto and proto not in permitted_protocols:
            return url  # bad protocol, no linkify

        href = m.group(1)
        if not proto:
            href = "http://" + href  # no proto specified, use http

        params = extra_params

        # clip long urls. max_len is just an approximation
        max_len = 30
        if shorten and len(url) > max_len:
            before_clip = url
            if proto:
                proto_len = len(proto) + 1 + len(m.group(3) or "")  # +1 for :
            else:
                proto_len = 0

            parts = url[proto_len:].split("/")
            if len(parts) > 1:
                # Grab the whole host part plus the first bit of the path
                # The path is usually not that interesting once shortened
                # (no more slug, etc), so it really just provides a little
                # extra indication of shortening.
                url = url[:proto_len] + parts[0] + "/" + \
                    parts[1][:8].split('?')[0].split('.')[0]

            if len(url) > max_len * 1.5:  # still too long
                url = url[:max_len]

            if url != before_clip:
                amp = url.rfind('&')
                # avoid splitting html char entities
                if amp > max_len - 5:
                    url = url[:amp]
                url += "..."

                if len(url) >= len(before_clip):
                    url = before_clip
                else:
                    # full url is visible on mouse-over (for those who don't
                    # have a status bar, such as Safari by default)
                    params += ' title="%s"' % href

        return u'<a href="%s"%s>%s</a>' % (href, params, url)

    # First HTML-escape so that our strings are all safe.
    # The regex is modified to avoid character entities other than &amp; so
    # that we won't pick up &quot;, etc.
    text = _unicode(xhtml_escape(text))
    return _URL_RE.sub(make_link, text)
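# Illustrative usage of linkify (not part of the original module):
#
#     linkify(u'visit www.example.com today')
#         -> u'visit <a href="http://www.example.com">www.example.com</a> today'
#     linkify(u'visit www.example.com today', require_protocol=True)
#         -> u'visit www.example.com today'   (no scheme, left alone)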
def _convert_entity(m):
    if m.group(1) == "#":
        try:
            return unichr(int(m.group(2)))
        except ValueError:
            return "&#%s;" % m.group(2)
    try:
        return _HTML_UNICODE_MAP[m.group(2)]
    except KeyError:
        return "&%s;" % m.group(2)

def _build_unicode_map():
    unicode_map = {}
    for name, value in htmlentitydefs.name2codepoint.iteritems():
        unicode_map[name] = unichr(value)
    return unicode_map

_HTML_UNICODE_MAP = _build_unicode_map()