PageRenderTime 54ms CodeModel.GetById 19ms RepoModel.GetById 1ms app.codeStats 0ms

/r2/r2/lib/filters.py

https://github.com/wangmxf/lesswrong
Python | 205 lines | 162 code | 16 blank | 27 comment | 4 complexity | c8c6373e186b61856f85a6d6f579a57a MD5 | raw file
Possible License(s): MPL-2.0-no-copyleft-exception, LGPL-2.1
  1. # The contents of this file are subject to the Common Public Attribution
  2. # License Version 1.0. (the "License"); you may not use this file except in
  3. # compliance with the License. You may obtain a copy of the License at
  4. # http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
  5. # License Version 1.1, but Sections 14 and 15 have been added to cover use of
  6. # software over a computer network and provide for limited attribution for the
  7. # Original Developer. In addition, Exhibit A has been modified to be consistent
  8. # with Exhibit B.
  9. #
  10. # Software distributed under the License is distributed on an "AS IS" basis,
  11. # WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
  12. # the specific language governing rights and limitations under the License.
  13. #
  14. # The Original Code is Reddit.
  15. #
  16. # The Original Developer is the Initial Developer. The Initial Developer of the
  17. # Original Code is CondeNet, Inc.
  18. #
  19. # All portions of the code written by CondeNet are Copyright (c) 2006-2008
  20. # CondeNet, Inc. All Rights Reserved.
  21. ################################################################################
  22. from pylons import c
  23. import cgi
  24. import urllib
  25. import re
  26. import lxml.html
  27. from lxml.html import soupparser
  28. from lxml.html.clean import Cleaner, autolink_html
# Markers wrapped around rendered markdown so templates/CSS can target it.
MD_START = '<div class="md">'
MD_END = '</div>'
# The lxml Cleaners are initialised with differences from the defaults:
# embedded=False: do NOT strip embedded content -- flash movies are allowed
#                 in posts.
# safe_attrs_only=False: <object>/<embed> need attributes outside lxml's
#                        "safe" whitelist, so keep all attributes.
# style=True (comments only): strip <style> blocks and style attributes
#                             from comment HTML.
sanitizer = Cleaner(embedded=False,safe_attrs_only=False)
comment_sanitizer = Cleaner(embedded=False,style=True,safe_attrs_only=False)
  37. def python_websafe(text):
  38. return text.replace('&', "&amp;").replace("<", "&lt;").replace(">", "&gt;").replace('"', "&quot;")
  39. def python_websafe_json(text):
  40. return text.replace('&', "&amp;").replace("<", "&lt;").replace(">", "&gt;")
try:
    # Prefer the C-accelerated escapers when the optional Cfilters
    # extension module is available; otherwise fall back to the
    # pure-Python implementations above.
    from Cfilters import uwebsafe as c_websafe, uwebsafe_json as c_websafe_json
except ImportError:
    c_websafe = python_websafe
    c_websafe_json = python_websafe_json
  46. # There is a C implementation of this in Cfilters, but it's out-of-date and
  47. # currently unused.
  48. _spaces = re.compile(r'(\s)\s+')
  49. def spaceCompress(content):
  50. return _spaces.sub(r'\1', content.strip())
  51. class _Unsafe(unicode): pass
  52. def _force_unicode(text):
  53. try:
  54. text = unicode(text, 'utf-8', 'ignore')
  55. except TypeError:
  56. text = unicode(text)
  57. return text
  58. def _force_utf8(text):
  59. return str(_force_unicode(text).encode('utf8'))
  60. def _force_ascii(text):
  61. return _force_unicode(text).encode('ascii', 'ignore')
  62. def unsafe(text=''):
  63. return _Unsafe(_force_unicode(text))
  64. def unsafe_wrap_md(html=''):
  65. return unsafe(MD_START + html + MD_END)
  66. def websafe_json(text=""):
  67. return c_websafe_json(_force_unicode(text))
  68. def websafe(text=''):
  69. if text.__class__ == _Unsafe:
  70. return text
  71. elif text.__class__ != unicode:
  72. text = _force_unicode(text)
  73. return c_websafe(text)
  74. from mako.filters import url_escape
  75. def edit_comment_filter(text = ''):
  76. try:
  77. text = unicode(text, 'utf-8')
  78. except TypeError:
  79. text = unicode(text)
  80. return url_escape(text)
  81. #TODO is this fast?
  82. url_re = re.compile(r"""
  83. (\[[^\]]*\]:?)? # optional leading pair of square brackets
  84. \s* # optional whitespace
  85. (\()? # optional open bracket
  86. (?<![<]) # No angle around link already
  87. (http://[^\s\'\"\]\)]+) # a http uri
  88. (?![>]) # No angle around link already
  89. (\))? # optional close bracket
  90. """, re.VERBOSE)
  91. jscript_url = re.compile('<a href="(?!http|ftp|mailto|/).*</a>', re.I | re.S)
  92. href_re = re.compile('<a href="([^"]+)"', re.I | re.S)
  93. code_re = re.compile('<code>([^<]+)</code>')
  94. a_re = re.compile('>([^<]+)</a>')
  95. def wrap_urls(text):
  96. #wrap urls in "<>" so that markdown will handle them as urls
  97. matches = url_re.finditer(text)
  98. def check(match):
  99. square_brackets, open_bracket, link, close_bracket = match.groups()
  100. return match if link and not square_brackets else None
  101. matched = filter(None, [check(match) for match in matches])
  102. segments = []
  103. start = 0
  104. for match in matched:
  105. segments.extend([text[start:match.start(3)], '<', match.group(3), '>'])
  106. start = match.end(3)
  107. # Tack on any trailing bits
  108. segments.append(text[start:])
  109. return ''.join(segments)
#TODO markdown should be looked up in batch?
#@memoize('markdown')
def safemarkdown(text, div=True):
    """Render untrusted markdown source to sanitised HTML.

    Pipeline: escape &/</> once, angle-wrap bare http URLs, run markdown,
    strip anchors with suspicious href schemes, then undo the double
    '&amp;' escaping inside hrefs, <code> bodies and link text.

    Returns the HTML wrapped in the markdown <div> when div is true, the
    bare HTML otherwise, and (implicitly) None when text is falsy.
    """
    from contrib.markdown import markdown
    if text:
        # increase escaping of &, < and > once, so raw HTML in the
        # source is neutralised before markdown runs
        text = text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
        text = wrap_urls(text)
        try:
            text = markdown(text)
        except RuntimeError:
            # markdown blew up (e.g. recursion) -- show a placeholder
            text = "<p><em>Comment Broken</em></p>"
        #wipe malicious javascript: drop any <a> whose href does not start
        # with http/ftp/mailto or a relative path (see jscript_url)
        text = jscript_url.sub('', text)
        def href_handler(m):
            # restore '&' in query strings that the initial escape doubled
            x = m.group(1).replace('&amp;', '&')
            # NOTE(review): c is the pylons request context; cname
            # presumably flags a custom-domain request -- confirm
            if c.cname:
                return '<a target="_top" href="%s"' % x
            else:
                return '<a href="%s"' % x
        def code_handler(m):
            # code spans show literal text; undo the doubled '&' escaping
            l = m.group(1)
            return '<code>%s</code>' % l.replace('&amp;','&')
        #unescape double escaping in links
        def inner_a_handler(m):
            l = m.group(1)
            return '>%s</a>' % l.replace('&amp;','&')
        # remove the "&" escaping in urls
        text = href_re.sub(href_handler, text)
        text = code_re.sub(code_handler, text)
        text = a_re.sub(inner_a_handler, text)
        return MD_START + text + MD_END if div else text
  142. def keep_space(text):
  143. text = websafe(text)
  144. for i in " \n\r\t":
  145. text=text.replace(i,'&#%02d;' % ord(i))
  146. return unsafe(text)
  147. def unkeep_space(text):
  148. return text.replace('&#32;', ' ').replace('&#10;', '\n').replace('&#09;', '\t')
  149. whitespace_re = re.compile('^\s*$')
  150. def killhtml(html=''):
  151. html_doc = soupparser.fromstring(remove_control_chars(html))
  152. text = filter(lambda text: not whitespace_re.match(text), html_doc.itertext())
  153. cleaned_html = ' '.join([fragment.strip() for fragment in text])
  154. return cleaned_html
  155. control_chars = re.compile('[\x00-\x08\x0b\x0c\x0e-\x1f]') # Control characters *except* \t \r \n
  156. def remove_control_chars(text):
  157. return control_chars.sub('',text)
  158. def cleanhtml(html='', cleaner=None):
  159. html_doc = soupparser.fromstring(remove_control_chars(html))
  160. if not cleaner:
  161. cleaner = sanitizer
  162. cleaned_html = cleaner.clean_html(html_doc)
  163. return lxml.html.tostring(autolink_html(cleaned_html))
  164. def clean_comment_html(html=''):
  165. return cleanhtml(html, comment_sanitizer)
  166. block_tags = r'h1|h2|h3|h4|h5|h6|table|ol|dl|ul|menu|dir|p|pre|center|form|fieldset|select|blockquote|address|div|hr'
  167. linebreaks_re = re.compile(r'(\n{2}|\r{2}|(?:\r\n){2}|</?(?:%s)[^>]*?>)' % block_tags)
  168. tags_re = re.compile(r'</?(?:%s)' % block_tags)
  169. def format_linebreaks(html=''):
  170. paragraphs = ['<p>%s</p>' % p if not tags_re.match(p) else p
  171. for p in linebreaks_re.split(html.strip())
  172. if not whitespace_re.match(p)]
  173. return ''.join(paragraphs)