lesswrong /r2/r2/lib/filters.py

Language: Python | Lines: 206
MD5: c8c6373e186b61856f85a6d6f579a57a
Repository: https://github.com/wangmxf/lesswrong.git
# The contents of this file are subject to the Common Public Attribution
# License Version 1.0. (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
# http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
# License Version 1.1, but Sections 14 and 15 have been added to cover use of
# software over a computer network and provide for limited attribution for the
# Original Developer. In addition, Exhibit A has been modified to be consistent
# with Exhibit B.
# 
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
# the specific language governing rights and limitations under the License.
# 
# The Original Code is Reddit.
# 
# The Original Developer is the Initial Developer.  The Initial Developer of the
# Original Code is CondeNet, Inc.
# 
# All portions of the code written by CondeNet are Copyright (c) 2006-2008
# CondeNet, Inc. All Rights Reserved.
################################################################################
from pylons import c

import cgi
import urllib
import re

import lxml.html
from lxml.html import soupparser
from lxml.html.clean import Cleaner, autolink_html

MD_START = '<div class="md">'
MD_END = '</div>'


# The cleaners are initialised with these differences from the defaults:
# embedded=False: we want to allow flash movies in posts
# style=True (comment cleaner only): enable removal of style attributes
# safe_attrs_only=False: need to allow strange attributes on <object>
sanitizer = Cleaner(embedded=False, safe_attrs_only=False)
comment_sanitizer = Cleaner(embedded=False, style=True, safe_attrs_only=False)
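
# Illustrative sketch (not in the original source): with these settings the
# Cleaner should still strip <script> tags (an lxml default), while embedded
# content and nonstandard attributes survive. For example,
#   sanitizer.clean_html('<p><script>evil()</script><embed src="m.swf"/></p>')
# should drop the <script> but keep the <embed>.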

def python_websafe(text):
    return text.replace('&', "&amp;").replace("<", "&lt;").replace(">", "&gt;").replace('"', "&quot;")

def python_websafe_json(text):
    return text.replace('&', "&amp;").replace("<", "&lt;").replace(">", "&gt;")
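
# Illustrative examples (not in the original source):
#   python_websafe('"<&>"')       ->  '&quot;&lt;&amp;&gt;&quot;'
#   python_websafe_json('"<&>"')  ->  '"&lt;&amp;&gt;"'
# The json variant leaves double quotes alone, presumably because the JSON
# encoder handles quoting itself.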

try:
    from Cfilters import uwebsafe as c_websafe, uwebsafe_json as c_websafe_json
except ImportError:
    c_websafe      = python_websafe
    c_websafe_json = python_websafe_json

# There is a C implementation of this in Cfilters, but it's out-of-date and
# currently unused.
_spaces = re.compile(r'(\s)\s+')
def spaceCompress(content):
    return _spaces.sub(r'\1', content.strip())
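
# Illustrative example (not in the original source): after stripping, each
# whitespace run collapses to its first character, e.g.
#   spaceCompress("  a  b\n\n  c  ")  ->  'a b\nc'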

class _Unsafe(unicode): pass

def _force_unicode(text):
    try:
        text = unicode(text, 'utf-8', 'ignore')
    except TypeError:
        text = unicode(text)
    return text

def _force_utf8(text):
    return str(_force_unicode(text).encode('utf8'))

def _force_ascii(text):
    return _force_unicode(text).encode('ascii', 'ignore')
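
# Illustrative examples (not in the original source):
#   _force_unicode('caf\xc3\xa9')  ->  u'caf\xe9'  (UTF-8 bytes decoded)
#   _force_unicode(42)             ->  u'42'       (non-strings coerced)
#   _force_ascii(u'caf\xe9')       ->  'caf'       (non-ASCII dropped)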

def unsafe(text=''):
    return _Unsafe(_force_unicode(text))

def unsafe_wrap_md(html=''):
    return unsafe(MD_START + html + MD_END)

def websafe_json(text=""):
    return c_websafe_json(_force_unicode(text))

def websafe(text=''):
    if text.__class__ == _Unsafe:
        return text
    elif text.__class__ != unicode:
        text = _force_unicode(text)
    return c_websafe(text)
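
# Illustrative example (not in the original source): text already marked
# trusted via unsafe() passes through unescaped; everything else is escaped:
#   websafe('<b>')          ->  u'&lt;b&gt;'
#   websafe(unsafe('<b>'))  ->  u'<b>'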

from mako.filters import url_escape
def edit_comment_filter(text = ''):
    try:
        text = unicode(text, 'utf-8')
    except TypeError:
        text = unicode(text)
    return url_escape(text)

#TODO is this fast?
url_re = re.compile(r"""
    (\[[^\]]*\]:?)?         # optional leading pair of square brackets
    \s*                     # optional whitespace
    (\()?                   # optional open bracket
    (?<![<])                # No angle around link already
    (http://[^\s\'\"\]\)]+) # a http uri
    (?![>])                 # No angle around link already
    (\))?                   # optional close bracket
    """, re.VERBOSE)
# jscript_url: anchors whose href does not start with http/ftp/mailto or "/"
# are treated as potential javascript: vectors and removed wholesale below
jscript_url = re.compile('<a href="(?!http|ftp|mailto|/).*</a>', re.I | re.S)
# the remaining patterns are used in safemarkdown to undo markdown's
# double-escaping of "&" inside hrefs, <code> blocks and link text
href_re = re.compile('<a href="([^"]+)"', re.I | re.S)
code_re = re.compile('<code>([^<]+)</code>')
a_re    = re.compile('>([^<]+)</a>')

def wrap_urls(text):
    #wrap urls in "<>" so that markdown will handle them as urls
    matches = url_re.finditer(text)
    def check(match):
        square_brackets, open_bracket, link, close_bracket = match.groups()
        return match if link and not square_brackets else None

    matched = filter(None, [check(match) for match in matches])
    segments = []
    start = 0
    for match in matched:
        segments.extend([text[start:match.start(3)], '<', match.group(3), '>'])
        start = match.end(3)

    # Tack on any trailing bits
    segments.append(text[start:])

    return ''.join(segments)
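
# Illustrative examples (not in the original source):
#   wrap_urls('see http://example.com now')  ->  'see <http://example.com> now'
# URLs already sitting in a markdown reference definition are left alone:
#   wrap_urls('[1]: http://example.com')     ->  '[1]: http://example.com'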

#TODO markdown should be looked up in batch?
#@memoize('markdown')
def safemarkdown(text, div=True):
    from contrib.markdown import markdown
    if text:
        # escape &, < and > exactly once before markdown runs
        text = text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
        text = wrap_urls(text)

        try:
            text = markdown(text)
        except RuntimeError:
            text = "<p><em>Comment Broken</em></p>"
        #wipe malicious javascript
        text = jscript_url.sub('', text)
        def href_handler(m):
            x = m.group(1).replace('&amp;', '&')
            if c.cname:
                return '<a target="_top" href="%s"' % x
            else:
                return '<a href="%s"' % x
        def code_handler(m):
            l = m.group(1)
            return '<code>%s</code>' % l.replace('&amp;','&')
        #unescape double escaping in links
        def inner_a_handler(m):
            l = m.group(1)
            return '>%s</a>' % l.replace('&amp;','&')
        # remove the "&" escaping in urls
        text = href_re.sub(href_handler, text)
        text = code_re.sub(code_handler, text)
        text = a_re.sub(inner_a_handler, text)
        return MD_START + text + MD_END if div else text
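
# Illustrative usage (not in the original source); the exact output depends
# on the bundled markdown version, but roughly:
#   safemarkdown('*hi*')             ->  '<div class="md"><p><em>hi</em></p></div>'
#   safemarkdown('*hi*', div=False)  ->  '<p><em>hi</em></p>'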

def keep_space(text):
    text = websafe(text)
    for i in " \n\r\t":
        text=text.replace(i,'&#%02d;' % ord(i))
    return unsafe(text)

def unkeep_space(text):
    # also undo the '&#13;' (\r) encoding produced by keep_space above
    return text.replace('&#32;', ' ').replace('&#10;', '\n').replace('&#13;', '\r').replace('&#09;', '\t')
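
# Illustrative round trip (not in the original source):
#   keep_space('a b\n')            ->  u'a&#32;b&#10;'  (an _Unsafe instance)
#   unkeep_space(u'a&#32;b&#10;')  ->  u'a b\n'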

whitespace_re = re.compile(r'^\s*$')   # empty or whitespace-only strings
def killhtml(html=''):
    html_doc = soupparser.fromstring(remove_control_chars(html))
    text = filter(lambda text: not whitespace_re.match(text), html_doc.itertext())
    cleaned_html = ' '.join([fragment.strip() for fragment in text])
    return cleaned_html
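
# Illustrative example (not in the original source): the markup is parsed
# and only the text content is kept, e.g.
#   killhtml('<p>Hello <b>world</b></p>')  ->  'Hello world'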

control_chars = re.compile('[\x00-\x08\x0b\x0c\x0e-\x1f]')   # Control characters *except* \t \r \n
def remove_control_chars(text):
    return control_chars.sub('',text)
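
# Illustrative example (not in the original source):
#   remove_control_chars('a\x00b\tc')  ->  'ab\tc'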

def cleanhtml(html='', cleaner=None):
    html_doc = soupparser.fromstring(remove_control_chars(html))
    if not cleaner:
        cleaner = sanitizer
    cleaned_html = cleaner.clean_html(html_doc)
    return lxml.html.tostring(autolink_html(cleaned_html))

def clean_comment_html(html=''):
    return cleanhtml(html, comment_sanitizer)

block_tags = r'h1|h2|h3|h4|h5|h6|table|ol|dl|ul|menu|dir|p|pre|center|form|fieldset|select|blockquote|address|div|hr'
linebreaks_re = re.compile(r'(\n{2}|\r{2}|(?:\r\n){2}|</?(?:%s)[^>]*?>)' % block_tags)
tags_re = re.compile(r'</?(?:%s)' % block_tags)
def format_linebreaks(html=''):
    paragraphs = ['<p>%s</p>' % p if not tags_re.match(p) else p
                  for p in linebreaks_re.split(html.strip())
                  if not whitespace_re.match(p)]
    return ''.join(paragraphs)
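
# Illustrative examples (not in the original source): double newlines become
# paragraph breaks, while existing block tags pass through unwrapped:
#   format_linebreaks('one\n\ntwo')    ->  '<p>one</p><p>two</p>'
#   format_linebreaks('<div>x</div>')  ->  '<div><p>x</p></div>'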