PageRenderTime 88ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/tornado/escape.py

https://github.com/daharon/tornado
Python | 327 lines | 206 code | 32 blank | 89 comment | 38 complexity | e6e9087e777b8e22fa840b7a493511f1 MD5 | raw file
  1. #!/usr/bin/env python
  2. #
  3. # Copyright 2009 Facebook
  4. #
  5. # Licensed under the Apache License, Version 2.0 (the "License"); you may
  6. # not use this file except in compliance with the License. You may obtain
  7. # a copy of the License at
  8. #
  9. # http://www.apache.org/licenses/LICENSE-2.0
  10. #
  11. # Unless required by applicable law or agreed to in writing, software
  12. # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  13. # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  14. # License for the specific language governing permissions and limitations
  15. # under the License.
  16. """Escaping/unescaping methods for HTML, JSON, URLs, and others.
  17. Also includes a few other miscellaneous string manipulation functions that
  18. have crept in over time.
  19. """
  20. import htmlentitydefs
  21. import re
  22. import sys
  23. import urllib
# Python3 compatibility: On python2.5, introduce the bytes alias from 2.6.
# Merely evaluating the name is enough: if the lookup fails, ``bytes`` is
# bound to ``str`` instead.
try: bytes
except Exception: bytes = str

# parse_qs is taken from urlparse when available, otherwise from cgi.
try:
    from urlparse import parse_qs  # Python 2.6+
except ImportError:
    from cgi import parse_qs
# json module is in the standard library as of python 2.6; fall back to
# simplejson if present for older versions.
#
# Whichever branch succeeds binds two module-level callables:
#   _json_decode(s) -- parse a JSON string into Python objects
#   _json_encode(v) -- serialize a Python object to a JSON string
try:
    import json
    # Reject a module that is importable as ``json`` but lacks the API
    # this file needs; the failed assert falls through to the fallbacks.
    assert hasattr(json, "loads") and hasattr(json, "dumps")
    _json_decode = json.loads
    _json_encode = json.dumps
except Exception:
    try:
        import simplejson
        # _unicode is looked up when the lambda is called, so it does not
        # need to be defined yet at this point in the module.
        _json_decode = lambda s: simplejson.loads(_unicode(s))
        _json_encode = lambda v: simplejson.dumps(v)
    except ImportError:
        try:
            # For Google AppEngine
            from django.utils import simplejson
            _json_decode = lambda s: simplejson.loads(_unicode(s))
            _json_encode = lambda v: simplejson.dumps(v)
        except ImportError:
            # No JSON library at all: the module still imports, but any
            # attempt to encode or decode raises NotImplementedError.
            def _json_decode(s):
                raise NotImplementedError(
                    "A JSON parser is required, e.g., simplejson at "
                    "http://pypi.python.org/pypi/simplejson/")
            _json_encode = _json_decode
  55. _XHTML_ESCAPE_RE = re.compile('[&<>"]')
  56. _XHTML_ESCAPE_DICT = {'&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;'}
  57. def xhtml_escape(value):
  58. """Escapes a string so it is valid within XML or XHTML."""
  59. return _XHTML_ESCAPE_RE.sub(lambda match: _XHTML_ESCAPE_DICT[match.group(0)],
  60. to_basestring(value))
  61. def xhtml_unescape(value):
  62. """Un-escapes an XML-escaped string."""
  63. return re.sub(r"&(#?)(\w+?);", _convert_entity, _unicode(value))
  64. def json_encode(value):
  65. """JSON-encodes the given Python object."""
  66. # JSON permits but does not require forward slashes to be escaped.
  67. # This is useful when json data is emitted in a <script> tag
  68. # in HTML, as it prevents </script> tags from prematurely terminating
  69. # the javscript. Some json libraries do this escaping by default,
  70. # although python's standard library does not, so we do it here.
  71. # http://stackoverflow.com/questions/1580647/json-why-are-forward-slashes-escaped
  72. return _json_encode(recursive_unicode(value)).replace("</", "<\\/")
  73. def json_decode(value):
  74. """Returns Python objects for the given JSON string."""
  75. return _json_decode(to_basestring(value))
  76. def squeeze(value):
  77. """Replace all sequences of whitespace chars with a single space."""
  78. return re.sub(r"[\x00-\x20]+", " ", value).strip()
  79. def url_escape(value):
  80. """Returns a valid URL-encoded version of the given value."""
  81. return urllib.quote_plus(utf8(value))
  82. # python 3 changed things around enough that we need two separate
  83. # implementations of url_unescape. We also need our own implementation
  84. # of parse_qs since python 3's version insists on decoding everything.
  85. if sys.version_info[0] < 3:
  86. def url_unescape(value, encoding='utf-8'):
  87. """Decodes the given value from a URL.
  88. The argument may be either a byte or unicode string.
  89. If encoding is None, the result will be a byte string. Otherwise,
  90. the result is a unicode string in the specified encoding.
  91. """
  92. if encoding is None:
  93. return urllib.unquote_plus(utf8(value))
  94. else:
  95. return unicode(urllib.unquote_plus(utf8(value)), encoding)
  96. parse_qs_bytes = parse_qs
  97. else:
  98. def url_unescape(value, encoding='utf-8'):
  99. """Decodes the given value from a URL.
  100. The argument may be either a byte or unicode string.
  101. If encoding is None, the result will be a byte string. Otherwise,
  102. the result is a unicode string in the specified encoding.
  103. """
  104. if encoding is None:
  105. return urllib.parse.unquote_to_bytes(value)
  106. else:
  107. return urllib.unquote_plus(to_basestring(value), encoding=encoding)
  108. def parse_qs_bytes(qs, keep_blank_values=False, strict_parsing=False):
  109. """Parses a query string like urlparse.parse_qs, but returns the
  110. values as byte strings.
  111. Keys still become type str (interpreted as latin1 in python3!)
  112. because it's too painful to keep them as byte strings in
  113. python3 and in practice they're nearly always ascii anyway.
  114. """
  115. # This is gross, but python3 doesn't give us another way.
  116. # Latin1 is the universal donor of character encodings.
  117. result = parse_qs(qs, keep_blank_values, strict_parsing,
  118. encoding='latin1', errors='strict')
  119. encoded = {}
  120. for k,v in result.iteritems():
  121. encoded[k] = [i.encode('latin1') for i in v]
  122. return encoded
  123. _UTF8_TYPES = (bytes, type(None))
  124. def utf8(value):
  125. """Converts a string argument to a byte string.
  126. If the argument is already a byte string or None, it is returned unchanged.
  127. Otherwise it must be a unicode string and is encoded as utf8.
  128. """
  129. if isinstance(value, _UTF8_TYPES):
  130. return value
  131. assert isinstance(value, unicode)
  132. return value.encode("utf-8")
  133. _TO_UNICODE_TYPES = (unicode, type(None))
  134. def to_unicode(value):
  135. """Converts a string argument to a unicode string.
  136. If the argument is already a unicode string or None, it is returned
  137. unchanged. Otherwise it must be a byte string and is decoded as utf8.
  138. """
  139. if isinstance(value, _TO_UNICODE_TYPES):
  140. return value
  141. assert isinstance(value, bytes)
  142. return value.decode("utf-8")
  143. # to_unicode was previously named _unicode not because it was private,
  144. # but to avoid conflicts with the built-in unicode() function/type
  145. _unicode = to_unicode
  146. # When dealing with the standard library across python 2 and 3 it is
  147. # sometimes useful to have a direct conversion to the native string type
  148. if str is unicode:
  149. native_str = to_unicode
  150. else:
  151. native_str = utf8
  152. _BASESTRING_TYPES = (basestring, type(None))
  153. def to_basestring(value):
  154. """Converts a string argument to a subclass of basestring.
  155. In python2, byte and unicode strings are mostly interchangeable,
  156. so functions that deal with a user-supplied argument in combination
  157. with ascii string constants can use either and should return the type
  158. the user supplied. In python3, the two types are not interchangeable,
  159. so this method is needed to convert byte strings to unicode.
  160. """
  161. if isinstance(value, _BASESTRING_TYPES):
  162. return value
  163. assert isinstance(value, bytes)
  164. return value.decode("utf-8")
  165. def recursive_unicode(obj):
  166. """Walks a simple data structure, converting byte strings to unicode.
  167. Supports lists, tuples, and dictionaries.
  168. """
  169. if isinstance(obj, dict):
  170. return dict((recursive_unicode(k), recursive_unicode(v)) for (k,v) in obj.iteritems())
  171. elif isinstance(obj, list):
  172. return list(recursive_unicode(i) for i in obj)
  173. elif isinstance(obj, tuple):
  174. return tuple(recursive_unicode(i) for i in obj)
  175. elif isinstance(obj, bytes):
  176. return to_unicode(obj)
  177. else:
  178. return obj
# I originally used the regex from
# http://daringfireball.net/2010/07/improved_regex_for_matching_urls
# but it gets all exponential on certain patterns (such as too many trailing
# dots), causing the regex matcher to never return.
# This regex should avoid those problems.
#
# Capture groups (read by the make_link callback inside linkify):
#   1: the whole matched URL
#   2: the protocol name before ":" (unset when the match starts with "www.")
#   3: the one to three slashes following the protocol
_URL_RE = re.compile(ur"""\b((?:([\w-]+):(/{1,3})|www[.])(?:(?:(?:[^\s&()]|&amp;|&quot;)*(?:[^!"#$%&'()*+,.:;<=>?@\[\]^`{|}~\s]))|(?:\((?:[^\s&()]|&amp;|&quot;)*\)))+)""")
def linkify(text, shorten=False, extra_params="",
            require_protocol=False, permitted_protocols=["http", "https"]):
    """Converts plain text into HTML with links.

    For example: ``linkify("Hello http://tornadoweb.org!")`` would return
    ``Hello <a href="http://tornadoweb.org">http://tornadoweb.org</a>!``

    The input is HTML-escaped with ``xhtml_escape`` and converted to
    unicode before URLs are searched for, and the result is returned as
    a unicode string.

    Parameters:

    shorten: Long urls will be shortened for display.  When a url is
        actually shortened, the full href is added as a ``title``
        attribute on the link.

    extra_params: Extra text to include in the link tag,
        e.g. linkify(text, extra_params='rel="nofollow" class="external"')

    require_protocol: Only linkify urls which include a protocol. If this is
        False, urls such as www.facebook.com will also be linkified.

    permitted_protocols: List (or set) of protocols which should be linkified,
        e.g. linkify(text, permitted_protocols=["http", "ftp", "mailto"]).
        It is very unsafe to include protocols such as "javascript".
    """
    # NOTE(review): the default list for permitted_protocols is shared
    # between calls; it is only read (membership test) here, never mutated.
    if extra_params:
        extra_params = " " + extra_params.strip()

    def make_link(m):
        # Group 1 is the whole url, group 2 the protocol (if any) and
        # group 3 the slashes after the protocol; see _URL_RE.
        url = m.group(1)
        proto = m.group(2)
        if require_protocol and not proto:
            return url  # not protocol, no linkify

        if proto and proto not in permitted_protocols:
            return url  # bad protocol, no linkify

        # href keeps the full url; ``url`` below is only the display text
        # and may be shortened.
        href = m.group(1)
        if not proto:
            href = "http://" + href  # no proto specified, use http

        params = extra_params

        # clip long urls. max_len is just an approximation
        max_len = 30
        if shorten and len(url) > max_len:
            before_clip = url
            if proto:
                proto_len = len(proto) + 1 + len(m.group(3) or "")  # +1 for :
            else:
                proto_len = 0

            parts = url[proto_len:].split("/")
            if len(parts) > 1:
                # Grab the whole host part plus the first bit of the path
                # The path is usually not that interesting once shortened
                # (no more slug, etc), so it really just provides a little
                # extra indication of shortening.
                url = url[:proto_len] + parts[0] + "/" + \
                        parts[1][:8].split('?')[0].split('.')[0]

            if len(url) > max_len * 1.5:  # still too long
                url = url[:max_len]

            if url != before_clip:
                amp = url.rfind('&')
                # avoid splitting html char entities
                if amp > max_len - 5:
                    url = url[:amp]
                url += "..."

                # Shortening that does not actually save characters is
                # discarded in favour of the original text.
                if len(url) >= len(before_clip):
                    url = before_clip
                else:
                    # full url is visible on mouse-over (for those who don't
                    # have a status bar, such as Safari by default)
                    params += ' title="%s"' % href

        return u'<a href="%s"%s>%s</a>' % (href, params, url)

    # First HTML-escape so that our strings are all safe.
    # The regex is modified to avoid character entities other than &amp; so
    # that we won't pick up &quot;, etc.
    text = _unicode(xhtml_escape(text))
    return _URL_RE.sub(make_link, text)
  249. def _convert_entity(m):
  250. if m.group(1) == "#":
  251. try:
  252. return unichr(int(m.group(2)))
  253. except ValueError:
  254. return "&#%s;" % m.group(2)
  255. try:
  256. return _HTML_UNICODE_MAP[m.group(2)]
  257. except KeyError:
  258. return "&%s;" % m.group(2)
  259. def _build_unicode_map():
  260. unicode_map = {}
  261. for name, value in htmlentitydefs.name2codepoint.iteritems():
  262. unicode_map[name] = unichr(value)
  263. return unicode_map
  264. _HTML_UNICODE_MAP = _build_unicode_map()