
/r2/r2/lib/filters.py

https://github.com/stevewilber/reddit
# The contents of this file are subject to the Common Public Attribution
# License Version 1.0. (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
# http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
# License Version 1.1, but Sections 14 and 15 have been added to cover use of
# software over a computer network and provide for limited attribution for the
# Original Developer. In addition, Exhibit A has been modified to be consistent
# with Exhibit B.
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
# the specific language governing rights and limitations under the License.
#
# The Original Code is reddit.
#
# The Original Developer is the Initial Developer. The Initial Developer of
# the Original Code is reddit Inc.
#
# All portions of the code written by reddit are Copyright (c) 2006-2012 reddit
# Inc. All Rights Reserved.
###############################################################################
import cgi
import os
import urllib
import re

import snudown

from cStringIO import StringIO
from xml.sax.handler import ContentHandler
from lxml.sax import saxify
import lxml.etree
from BeautifulSoup import BeautifulSoup

from pylons import g, c

from wrapped import Templated, CacheStub
SC_OFF = "<!-- SC_OFF -->"
SC_ON = "<!-- SC_ON -->"

MD_START = '<div class="md">'
MD_END = '</div>'

WIKI_MD_START = '<div class="md wiki">'
WIKI_MD_END = '</div>'

custom_img_url = re.compile(r'\A%%([a-zA-Z0-9\-]+)%%$')
def python_websafe(text):
    return text.replace('&', "&amp;").replace("<", "&lt;").replace(">", "&gt;").replace('"', "&quot;")

def python_websafe_json(text):
    return text.replace('&', "&amp;").replace("<", "&lt;").replace(">", "&gt;")
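
# For example, the JSON variant leaves double quotes alone, since its output
# is embedded in JSON string literals rather than HTML attributes:
#
#   >>> python_websafe('"a" & <b>')
#   '&quot;a&quot; &amp; &lt;b&gt;'
#   >>> python_websafe_json('"a" & <b>')
#   '"a" &amp; &lt;b&gt;'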
try:
    # Optional C implementations of the escaping/space-compression helpers;
    # fall back to the pure-Python versions below if the extension is missing.
    from Cfilters import uwebsafe as c_websafe, uspace_compress, \
        uwebsafe_json as c_websafe_json

    def spaceCompress(text):
        try:
            text = unicode(text, 'utf-8')
        except TypeError:
            text = unicode(text)
        return uspace_compress(text)
except ImportError:
    c_websafe = python_websafe
    c_websafe_json = python_websafe_json

    _between_tags1 = re.compile('> +')
    _between_tags2 = re.compile(' +<')
    _spaces = re.compile('[\s]+')
    _ignore = re.compile('(' + SC_OFF + '|' + SC_ON + ')', re.S | re.I)

    def spaceCompress(content):
        res = ''
        sc = True
        for p in _ignore.split(content):
            if p == SC_ON:
                sc = True
            elif p == SC_OFF:
                sc = False
            elif sc:
                p = _spaces.sub(' ', p)
                p = _between_tags1.sub('>', p)
                p = _between_tags2.sub('<', p)
                res += p
            else:
                res += p
        return res
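
# For example, with the pure-Python fallback above, runs of whitespace are
# collapsed and spaces next to tags stripped, while SC_OFF/SC_ON-delimited
# regions (e.g. rendered markdown containing <pre> blocks) pass through
# verbatim; the markers themselves are dropped from the output:
#
#   >>> spaceCompress('<p>  hello   world  </p>')
#   '<p>hello world</p>'
#   >>> spaceCompress(SC_OFF + '<pre>  a  </pre>' + SC_ON)
#   '<pre>  a  </pre>'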
class _Unsafe(unicode): pass

def _force_unicode(text):
    if text is None:
        return u''
    if isinstance(text, unicode):
        return text
    try:
        text = unicode(text, 'utf-8')
    except UnicodeDecodeError:
        text = unicode(text, 'latin1')
    except TypeError:
        text = unicode(text)
    return text

def _force_utf8(text):
    return str(_force_unicode(text).encode('utf8'))
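
# For example:
#
#   >>> _force_unicode(None)
#   u''
#   >>> _force_unicode('caf\xe9')   # not valid UTF-8, so decoded as latin-1
#   u'caf\xe9'
#   >>> _force_unicode(42)
#   u'42'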
def unsafe(text=''):
    return _Unsafe(_force_unicode(text))

def websafe_json(text=""):
    return c_websafe_json(_force_unicode(text))

def mako_websafe(text=''):
    if text.__class__ == _Unsafe:
        return text
    elif isinstance(text, Templated):
        return _Unsafe(text.render())
    elif isinstance(text, CacheStub):
        return _Unsafe(text)
    elif text is None:
        return ""
    elif text.__class__ != unicode:
        text = _force_unicode(text)
    return c_websafe(text)

def websafe(text=''):
    if text.__class__ != unicode:
        text = _force_unicode(text)
    # wrap the response in _Unsafe so mako_websafe doesn't escape it again
    return _Unsafe(c_websafe(text))
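
# Because websafe() wraps its result in _Unsafe, feeding an already-escaped
# value back through mako_websafe() does not double-escape it (shown here with
# the pure-Python escaper):
#
#   >>> mako_websafe('<b>hi</b>')
#   u'&lt;b&gt;hi&lt;/b&gt;'
#   >>> mako_websafe(websafe('<b>hi</b>'))
#   u'&lt;b&gt;hi&lt;/b&gt;'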
valid_link_schemes = (
    '/',
    '#',
    'http://',
    'https://',
    'ftp://',
    'mailto:',
    'steam://',
    'irc://',
    'ircs://',
    'news://',
    'mumble://',
    'ssh://',
    'git://',
)
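
# Link targets are whitelisted by prefix, which blocks e.g. javascript: URLs:
#
#   >>> any('javascript:alert(1)'.startswith(s) for s in valid_link_schemes)
#   False
#   >>> any('https://example.com/'.startswith(s) for s in valid_link_schemes)
#   True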
class SouptestSaxHandler(ContentHandler):
    def __init__(self, ok_tags):
        self.ok_tags = ok_tags

    def startElementNS(self, tagname, qname, attrs):
        if qname not in self.ok_tags:
            raise ValueError('HAX: Unknown tag: %r' % qname)

        for (ns, name), val in attrs.items():
            if ns is not None:
                raise ValueError('HAX: Unknown namespace? Seriously? %r' % ns)

            if name not in self.ok_tags[qname]:
                raise ValueError('HAX: Unknown attribute-name %r' % name)

            if qname == 'a' and name == 'href':
                lv = val.lower()
                if not any(lv.startswith(scheme)
                           for scheme in valid_link_schemes):
                    raise ValueError('HAX: Unsupported link scheme %r' % val)
markdown_ok_tags = {
    'div': ('class',),  # trailing comma: a one-element tuple, not a bare string
    'a': set(('href', 'title', 'target', 'nofollow')),
}

markdown_boring_tags = ('p', 'em', 'strong', 'br', 'ol', 'ul', 'hr', 'li',
                        'pre', 'code', 'blockquote', 'center',
                        'sup', 'del', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',)

markdown_user_tags = ('table', 'th', 'tr', 'td', 'tbody',
                      'img', 'thead', 'tfoot', 'caption')

for bt in markdown_boring_tags:
    markdown_ok_tags[bt] = ()

for bt in markdown_user_tags:
    markdown_ok_tags[bt] = ('colspan', 'rowspan', 'cellspacing',
                            'cellpadding', 'align', 'scope')
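
# The net effect of the two loops above is a whitelist mapping every allowed
# tag to its allowed attributes, e.g.:
#
#   >>> markdown_ok_tags['em']
#   ()
#   >>> sorted(markdown_ok_tags['table'])
#   ['align', 'cellpadding', 'cellspacing', 'colspan', 'rowspan', 'scope']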
markdown_xhtml_dtd_path = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    'contrib/dtds/xhtml.dtd')
markdown_dtd = '<!DOCTYPE div- SYSTEM "file://%s">' % markdown_xhtml_dtd_path
def markdown_souptest(text, nofollow=False, target=None):
    if not text:
        return text

    smd = safemarkdown(text, nofollow=nofollow, target=target)

    # Prepend a DTD reference so we can load up definitions of all the standard
    # XHTML entities (&nbsp;, etc.).
    smd_with_dtd = markdown_dtd + smd
    s = StringIO(smd_with_dtd)

    parser = lxml.etree.XMLParser(load_dtd=True)
    tree = lxml.etree.parse(s, parser)
    handler = SouptestSaxHandler(markdown_ok_tags)
    saxify(tree, handler)

    return smd
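
# Usage sketch (untrusted_comment_body is a placeholder): markdown_souptest()
# re-parses snudown's output and raises ValueError if any tag, attribute,
# namespace, or link scheme outside the whitelist slipped through; on success
# it returns the wrapped HTML unchanged.
#
#   try:
#       rendered = markdown_souptest(untrusted_comment_body)
#   except ValueError:
#       pass  # reject the markup rather than serving it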
#TODO markdown should be looked up in batch?
#@memoize('markdown')
def safemarkdown(text, nofollow=False, wrap=True, **kwargs):
    if not text:
        return None

    # this lets us skip the c.cname lookup (which is apparently quite
    # slow) if target was explicitly passed to this function.
    target = kwargs.get("target", None)
    if "target" not in kwargs and c.cname:
        target = "_top"

    text = snudown.markdown(_force_utf8(text), nofollow, target)

    if wrap:
        return SC_OFF + MD_START + text + MD_END + SC_ON
    else:
        return SC_OFF + text + SC_ON
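
# The exact markup depends on snudown's renderer, but the wrapping is fixed;
# roughly:
#
#   safemarkdown('**hi**')
#   # -> '<!-- SC_OFF --><div class="md"> ...rendered HTML... </div><!-- SC_ON -->'
#
# The SC_OFF/SC_ON markers keep spaceCompress() from touching the rendered
# body (which may contain whitespace-significant <pre> blocks).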
def wikimarkdown(text):
    from r2.lib.cssfilter import legacy_s3_url

    def img_swap(tag):
        name = tag.get('src')
        name = custom_img_url.search(name)
        name = name and name.group(1)
        if name and c.site.images.has_key(name):
            url = c.site.images[name]
            url = legacy_s3_url(url, c.site)
            tag['src'] = url
        else:
            tag.extract()

    nofollow = True
    target = None

    text = snudown.markdown(_force_utf8(text), nofollow, target,
                            renderer=snudown.RENDERER_WIKI, enable_toc=True)

    # TODO: We should test how much of a load this adds to the app
    soup = BeautifulSoup(text)
    images = soup.findAll('img')

    if images:
        for image in images:
            img_swap(image)
        text = str(soup)

    return SC_OFF + WIKI_MD_START + text + WIKI_MD_END + SC_ON
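
# Sketch of the image swap above: a wiki image reference such as
#
#   <img src="%%logo%%">
#
# gets its src rewritten to the subreddit's stored image URL when "logo" is a
# key in c.site.images; tags whose name doesn't resolve are removed entirely.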
def keep_space(text):
    text = websafe(text)
    for i in " \n\r\t":
        text = text.replace(i, '&#%02d;' % ord(i))
    return unsafe(text)

def unkeep_space(text):
    return text.replace('&#32;', ' ').replace('&#10;', '\n').replace('&#09;', '\t')
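
# Example round trip; note that '\r' is escaped by keep_space() (as '&#13;')
# but not restored by unkeep_space():
#
#   >>> keep_space('a  b\tc\n')
#   u'a&#32;&#32;b&#09;c&#10;'
#   >>> unkeep_space('a&#32;&#32;b&#09;c&#10;')
#   'a  b\tc\n'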