/wikimarkup/parser.py
Python | 2379 lines | 2354 code | 5 blank | 20 comment | 1 complexity | 0ffbecd270a36dd9c0e3a068e5739438 MD5 | raw file
Large files are truncated, but you can click here to view the full file
- # coding: latin1
- """
- MediaWiki-style markup
- Copyright (C) 2008 David Cramer <dcramer@gmail.com>
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
- """
- import re, random, locale
- from base64 import b64encode, b64decode
# a few patterns we use later

# Scanner states used when looking for ':' outside of HTML tags/comments
# (text, inside a tag, tag start, closing tag, self-close slash, comment).
MW_COLON_STATE_TEXT = 0
MW_COLON_STATE_TAG = 1
MW_COLON_STATE_TAGSTART = 2
MW_COLON_STATE_CLOSETAG = 3
MW_COLON_STATE_TAGSLASH = 4
MW_COLON_STATE_COMMENT = 5
MW_COLON_STATE_COMMENTDASH = 6
MW_COLON_STATE_COMMENTDASHDASH = 7

# HTML attribute pair: name="value", name='value', bare word, or #hex.
_attributePat = re.compile(ur'''(?:^|\s)([A-Za-z0-9]+)(?:\s*=\s*(?:\"([^<\"]*)\"|\'([^<\']*)\'|([a-zA-Z0-9!#$%&()*,\-./:;<>?@\[\]^_{|}~]+)|#([0-9a-fA-F]+)))''', re.UNICODE)
_space = re.compile(ur'\s+', re.UNICODE)
# <pre> open/close markers (case-insensitive)
_closePrePat = re.compile(u"</pre", re.UNICODE | re.IGNORECASE)
_openPrePat = re.compile(u"<pre", re.UNICODE | re.IGNORECASE)
# Tags that open/close block context — presumably used by the block-level
# pass (doBlockLevels); confirm against the truncated part of the file.
_openMatchPat = re.compile(u"(<table|<blockquote|<h1|<h2|<h3|<h4|<h5|<h6|<pre|<tr|<p|<ul|<ol|<li|</center|</tr|</td|</th)", re.UNICODE | re.IGNORECASE)
# Generic tag split: (slash)(name)(attributes)(close brace)(trailing text)
_tagPattern = re.compile(ur'^(/?)(\w+)([^>]*?)(/?>)([^<]*)$', re.UNICODE)

_htmlpairs = ( # Tags that must be closed
    u'b', u'del', u'i', u'ins', u'u', u'font', u'big', u'small', u'sub', u'sup', u'h1',
    u'h2', u'h3', u'h4', u'h5', u'h6', u'cite', u'code', u'em', u's',
    u'strike', u'strong', u'tt', u'var', u'div', u'center',
    u'blockquote', u'ol', u'ul', u'dl', u'table', u'caption', u'pre',
    u'ruby', u'rt' , u'rb' , u'rp', u'p', u'span', u'u',
)
_htmlsingle = (  # Tags whose close tag is optional
    u'br', u'hr', u'li', u'dt', u'dd', u'img',
)
_htmlsingleonly = ( # Elements that cannot have close tags
    u'br', u'hr', u'img',
)
_htmlnest = ( # Tags that can be nested--??
    u'table', u'tr', u'td', u'th', u'div', u'blockquote', u'ol', u'ul',
    u'dl', u'font', u'big', u'small', u'sub', u'sup', u'span', u'img',
)
_tabletags = ( # Can only appear inside table
    u'td', u'th', u'tr',
)
_htmllist = ( # Tags used by list
    u'ul', u'ol',
)
_listtags = ( # Tags that can appear in a list
    u'li',
)
# Derived sets consumed by removeHtmlTags()
_htmlsingleallowed = _htmlsingle + _tabletags
_htmlelements = _htmlsingle + _htmlpairs + _htmlnest
- _htmlEntities = {
- u'Aacute': 193, u'aacute': 225, u'Acirc': 194, u'acirc': 226, u'acute': 180,
- u'AElig': 198, u'aelig': 230, u'Agrave': 192, u'agrave': 224, u'alefsym': 8501,
- u'Alpha': 913, u'alpha': 945, u'amp': 38, u'and': 8743, u'ang': 8736, u'Aring': 197,
- u'aring': 229,
- u'asymp': 8776,
- u'Atilde': 195,
- u'atilde': 227,
- u'Auml': 196,
- u'auml': 228,
- u'bdquo': 8222,
- u'Beta': 914,
- u'beta': 946,
- u'brvbar': 166,
- u'bull': 8226,
- u'cap': 8745,
- u'Ccedil': 199,
- u'ccedil': 231,
- u'cedil': 184,
- u'cent': 162,
- u'Chi': 935,
- u'chi': 967,
- u'circ': 710,
- u'clubs': 9827,
- u'cong': 8773,
- u'copy': 169,
- u'crarr': 8629,
- u'cup': 8746,
- u'curren': 164,
- u'dagger': 8224,
- u'Dagger': 8225,
- u'darr': 8595,
- u'dArr': 8659,
- u'deg': 176,
- u'Delta': 916,
- u'delta': 948,
- u'diams': 9830,
- u'divide': 247,
- u'Eacute': 201,
- u'eacute': 233,
- u'Ecirc': 202,
- u'ecirc': 234,
- u'Egrave': 200,
- u'egrave': 232,
- u'empty': 8709,
- u'emsp': 8195,
- u'ensp': 8194,
- u'Epsilon': 917,
- u'epsilon': 949,
- u'equiv': 8801,
- u'Eta': 919,
- u'eta': 951,
- u'ETH': 208,
- u'eth': 240,
- u'Euml': 203,
- u'euml': 235,
- u'euro': 8364,
- u'exist': 8707,
- u'fnof': 402,
- u'forall': 8704,
- u'frac12': 189,
- u'frac14': 188,
- u'frac34': 190,
- u'frasl': 8260,
- u'Gamma': 915,
- u'gamma': 947,
- u'ge': 8805,
- u'gt': 62,
- u'harr': 8596,
- u'hArr': 8660,
- u'hearts': 9829,
- u'hellip': 8230,
- u'Iacute': 205,
- u'iacute': 237,
- u'Icirc': 206,
- u'icirc': 238,
- u'iexcl': 161,
- u'Igrave': 204,
- u'igrave': 236,
- u'image': 8465,
- u'infin': 8734,
- u'int': 8747,
- u'Iota': 921,
- u'iota': 953,
- u'iquest': 191,
- u'isin': 8712,
- u'Iuml': 207,
- u'iuml': 239,
- u'Kappa': 922,
- u'kappa': 954,
- u'Lambda': 923,
- u'lambda': 955,
- u'lang': 9001,
- u'laquo': 171,
- u'larr': 8592,
- u'lArr': 8656,
- u'lceil': 8968,
- u'ldquo': 8220,
- u'le': 8804,
- u'lfloor': 8970,
- u'lowast': 8727,
- u'loz': 9674,
- u'lrm': 8206,
- u'lsaquo': 8249,
- u'lsquo': 8216,
- u'lt': 60,
- u'macr': 175,
- u'mdash': 8212,
- u'micro': 181,
- u'middot': 183,
- u'minus': 8722,
- u'Mu': 924,
- u'mu': 956,
- u'nabla': 8711,
- u'nbsp': 160,
- u'ndash': 8211,
- u'ne': 8800,
- u'ni': 8715,
- u'not': 172,
- u'notin': 8713,
- u'nsub': 8836,
- u'Ntilde': 209,
- u'ntilde': 241,
- u'Nu': 925,
- u'nu': 957,
- u'Oacute': 211,
- u'oacute': 243,
- u'Ocirc': 212,
- u'ocirc': 244,
- u'OElig': 338,
- u'oelig': 339,
- u'Ograve': 210,
- u'ograve': 242,
- u'oline': 8254,
- u'Omega': 937,
- u'omega': 969,
- u'Omicron': 927,
- u'omicron': 959,
- u'oplus': 8853,
- u'or': 8744,
- u'ordf': 170,
- u'ordm': 186,
- u'Oslash': 216,
- u'oslash': 248,
- u'Otilde': 213,
- u'otilde': 245,
- u'otimes': 8855,
- u'Ouml': 214,
- u'ouml': 246,
- u'para': 182,
- u'part': 8706,
- u'permil': 8240,
- u'perp': 8869,
- u'Phi': 934,
- u'phi': 966,
- u'Pi': 928,
- u'pi': 960,
- u'piv': 982,
- u'plusmn': 177,
- u'pound': 163,
- u'prime': 8242,
- u'Prime': 8243,
- u'prod': 8719,
- u'prop': 8733,
- u'Psi': 936,
- u'psi': 968,
- u'quot': 34,
- u'radic': 8730,
- u'rang': 9002,
- u'raquo': 187,
- u'rarr': 8594,
- u'rArr': 8658,
- u'rceil': 8969,
- u'rdquo': 8221,
- u'real': 8476,
- u'reg': 174,
- u'rfloor': 8971,
- u'Rho': 929,
- u'rho': 961,
- u'rlm': 8207,
- u'rsaquo': 8250,
- u'rsquo': 8217,
- u'sbquo': 8218,
- u'Scaron': 352,
- u'scaron': 353,
- u'sdot': 8901,
- u'sect': 167,
- u'shy': 173,
- u'Sigma': 931,
- u'sigma': 963,
- u'sigmaf': 962,
- u'sim': 8764,
- u'spades': 9824,
- u'sub': 8834,
- u'sube': 8838,
- u'sum': 8721,
- u'sup': 8835,
- u'sup1': 185,
- u'sup2': 178,
- u'sup3': 179,
- u'supe': 8839,
- u'szlig': 223,
- u'Tau': 932,
- u'tau': 964,
- u'there4': 8756,
- u'Theta': 920,
- u'theta': 952,
- u'thetasym': 977,
- u'thinsp': 8201,
- u'THORN': 222,
- u'thorn': 254,
- u'tilde': 732,
- u'times': 215,
- u'trade': 8482,
- u'Uacute': 218,
- u'uacute': 250,
- u'uarr': 8593,
- u'uArr': 8657,
- u'Ucirc': 219,
- u'ucirc': 251,
- u'Ugrave': 217,
- u'ugrave': 249,
- u'uml': 168,
- u'upsih': 978,
- u'Upsilon': 933,
- u'upsilon': 965,
- u'Uuml': 220,
- u'uuml': 252,
- u'weierp': 8472,
- u'Xi': 926,
- u'xi': 958,
- u'Yacute': 221,
- u'yacute': 253,
- u'yen': 165,
- u'Yuml': 376,
- u'yuml': 255,
- u'Zeta': 918,
- u'zeta': 950,
- u'zwj': 8205,
- u'zwnj': 8204
- }
# Character reference: named entity, decimal, hex, or a bare ampersand.
_charRefsPat = re.compile(ur'''(&([A-Za-z0-9]+);|&#([0-9]+);|&#[xX]([0-9A-Za-z]+);|(&))''', re.UNICODE)
# CSS sanitation helpers used by checkCss()
_cssCommentPat = re.compile(ur'''\*.*?\*''', re.UNICODE)
_toUTFPat = re.compile(ur'''\\([0-9A-Fa-f]{1,6})[\s]?''', re.UNICODE)
_hackPat = re.compile(ur'''(expression|tps*://|url\s*\().*''', re.UNICODE | re.IGNORECASE)
# Wiki markup: horizontal rule (four or more dashes at line start)
_hrPat = re.compile(u'''^-----*''', re.UNICODE | re.MULTILINE)
# Heading levels 1-6 (= ... = through ====== ... ======)
_h1Pat = re.compile(u'^=(.+)=\s*$', re.UNICODE | re.MULTILINE)
_h2Pat = re.compile(u'^==(.+)==\s*$', re.UNICODE | re.MULTILINE)
_h3Pat = re.compile(u'^===(.+)===\s*$', re.UNICODE | re.MULTILINE)
_h4Pat = re.compile(u'^====(.+)====\s*$', re.UNICODE | re.MULTILINE)
_h5Pat = re.compile(u'^=====(.+)=====\s*$', re.UNICODE | re.MULTILINE)
_h6Pat = re.compile(u'^======(.+)======\s*$', re.UNICODE | re.MULTILINE)
# Runs of two or more apostrophes (italic/bold markup), kept by split()
_quotePat = re.compile(u"""(''+)""", re.UNICODE)
# English stop-words — presumably removed when building anchor/slug names
# (see safe_name/escapeId); confirm against the truncated part of the file.
_removePat = re.compile(ur'\b(' + ur'|'.join((u"a", u"an", u"as", u"at", u"before", u"but", u"by", u"for", u"from",
                        u"is", u"in", u"into", u"like", u"of", u"off", u"on", u"onto", u"per",
                        u"since", u"than", u"the", u"this", u"that", u"to", u"up", u"via",
                        u"with")) + ur')\b', re.UNICODE | re.IGNORECASE)
_nonWordSpaceDashPat = re.compile(ur'[^\w\s\-\./]', re.UNICODE)
_multiSpacePat = re.compile(ur'[\s\-_\./]+', re.UNICODE)
_spacePat = re.compile(ur' ', re.UNICODE)
# Link bodies: [[ns:target|label]] tail, [url label], and [[ns:target]]
_linkPat = re.compile(ur'^(?:([A-Za-z0-9]+):)?([^\|]+)(?:\|([^\n]+?))?\]\](.*)$', re.UNICODE | re.DOTALL)
_bracketedLinkPat = re.compile(ur'(?:\[((?:mailto:|git://|irc://|https?://|ftp://|/)[^<>\]\[' + u"\x00-\x20\x7f" + ur']*)\s*(.*?)\])', re.UNICODE)
_internalLinkPat = re.compile(ur'\[\[(?:(:?[^:\]]*?):\s*)?(.*?)\]\]')
# Recognized URL protocols for free (unbracketed) external links
_protocolPat = re.compile(ur'(\b(?:mailto:|irc://|https?://|ftp://))', re.UNICODE)
_specialUrlPat = re.compile(ur'^([^<>\]\[' + u"\x00-\x20\x7f" + ur']+)(.*)$', re.UNICODE)
_protocolsPat = re.compile(ur'^(mailto:|irc://|https?://|ftp://)$', re.UNICODE)
# Characters that must be %-escaped in URLs (see cleanURL/urlencode)
_controlCharsPat = re.compile(ur'[\]\[<>"' + u"\\x00-\\x20\\x7F" + ur']]', re.UNICODE)
_hostnamePat = re.compile(ur'^([^:]+:)(//[^/]+)?(.*)$', re.UNICODE)
# Characters ignored in IDN hostnames (RFC 3454 section 3.1)
_stripPat = re.compile(u'\\s|\u00ad|\u1806|\u200b|\u2060|\ufeff|\u03f4|\u034f|\u180b|\u180c|\u180d|\u200c|\u200d|[\ufe00-\ufe0f]', re.UNICODE)
# Table start: optional leading colons then '{|'
_zomgPat = re.compile(ur'^(:*)\{\|(.*)$', re.UNICODE)
_headerPat = re.compile(ur"<[Hh]([1-6])(.*?)>(.*?)</[Hh][1-6] *>", re.UNICODE)
_templateSectionPat = re.compile(ur"<!--MWTEMPLATESECTION=([^&]+)&([^_]+)-->", re.UNICODE)
_tagPat = re.compile(ur"<.*?>", re.UNICODE)
# Caches of compiled start/end regexes keyed by tag list (extractTagsAndParams)
_startRegexHash = {}
_endRegexHash = {}
_endCommentPat = re.compile(ur'(-->)', re.UNICODE)
# Monotonic counter used to build unique strip markers
_extractTagsAndParams_n = 1
# French guillemet spacing helpers; \302\253 / \302\273 look like the
# UTF-8 byte sequences for « and » written as octal regex escapes — confirm.
_guillemetLeftPat = re.compile(ur'(.) (\?|:|;|!|\302\273)', re.UNICODE)
_guillemetRightPat = re.compile(ur'(\302\253) ', re.UNICODE)
def setupAttributeWhitelist():
    """Build the map of HTML element name -> tuple of permitted attributes.

    Mirrors MediaWiki's Sanitizer whitelist; several presentational
    attributes (nowrap/width/height/bgcolor/...) are deliberately kept
    for legacy markup although deprecated in HTML.
    """
    common = (u'id', u'class', u'lang', u'dir', u'title', u'style')
    block = common + (u'align',)
    tablealign = (u'align', u'char', u'charoff', u'valign')
    tablecell = (u'abbr', u'axis', u'headers', u'scope', u'rowspan',
                 u'colspan',
                 # deprecated presentational attributes, kept on purpose:
                 u'nowrap', u'width', u'height', u'bgcolor')

    whitelist = {
        u'div': block,
        u'center': common,  # deprecated
        u'span': block,  # ??
        u'em': common,
        u'strong': common,
        u'cite': common,
        u'code': common,
        u'var': common,
        u'img': common + (u'src', u'alt', u'width', u'height'),
        u'blockquote': common + (u'cite',),
        u'sub': common,
        u'sup': common,
        u'p': block,
        u'br': (u'id', u'class', u'title', u'style', u'clear'),
        u'pre': common + (u'width',),
        u'ins': common + (u'cite', u'datetime'),
        u'del': common + (u'cite', u'datetime'),
        u'ul': common + (u'type',),
        u'ol': common + (u'type', u'start'),
        u'li': common + (u'type', u'value'),
        u'dl': common,
        u'dd': common,
        u'dt': common,
        u'table': common + (u'summary', u'width', u'border', u'frame',
                            u'rules', u'cellspacing', u'cellpadding',
                            u'align', u'bgcolor'),
        u'caption': common + (u'align',),
        u'thead': common + tablealign,
        u'tfoot': common + tablealign,
        u'tbody': common + tablealign,
        u'colgroup': common + (u'span', u'width') + tablealign,
        u'col': common + (u'span', u'width') + tablealign,
        u'tr': common + (u'bgcolor',) + tablealign,
        u'td': common + tablecell + tablealign,
        u'th': common + tablecell + tablealign,
        u'tt': common,
        u'b': common,
        u'i': common,
        u'big': common,
        u'small': common,
        u'strike': common,
        u's': common,
        u'u': common,
        u'font': common + (u'size', u'color', u'face'),
        u'hr': common + (u'noshade', u'size', u'width'),
        u'ruby': common,
        u'rb': common,
        u'rt': common,  # rbspan intentionally not ported from MediaWiki
        u'rp': common,
    }
    # All six heading levels share the block-level attribute set.
    for level in range(1, 7):
        whitelist[u'h%d' % level] = block
    return whitelist
# Per-element attribute whitelist, built once at import time.
_whitelist = setupAttributeWhitelist()
# Page cache shared across parser instances (populated elsewhere — confirm).
_page_cache = {}
# Module-level parser environment; BaseParser instances alias this dict.
env = {}
def registerTagHook(tag, function):
    """Register *function* to render <tag>...</tag> extension tags.

    The hook is called as function(parser, content, params).
    NOTE(review): mTagHooks is not defined in this part of the file;
    presumably a module-level dict declared in the truncated section.
    """
    mTagHooks[tag] = function
def registerInternalLinkHook(tag, function):
    """
    Register a hook called for [[internal links]]. There is no default
    handling for internal links.
    def internalLinkHook(parser_env, namespace, body):
        ...
        return replacement
    registerInternalLinkHook(None, internalLinkHook) # called for [[link]]
    registerInternalLinkHook('Wikipedia', internalLinkHook) # called for [[Wikipedia:link]]
    registerInternalLinkHook(':en', internalLinkHook) # called for [[:en:link]]
    registerInternalLinkHook(':', internalLinkHook) # called for [[:any:link]] not hooked above
    registerInternalLinkHook('*', internalLinkHook) # called for [[anything]] not hooked above
    """
    # NOTE(review): mInternalLinkHooks is presumably a module-level dict
    # declared in the truncated section of this file — confirm.
    mInternalLinkHooks[tag] = function
class BaseParser(object):
    """Core MediaWiki-markup parser: strips protected regions, sanitizes
    HTML, and converts wiki markup (quotes, links, headings) to HTML."""
    def __init__(self):
        # Marker prefix for stripped regions; \x07 (BEL) cannot appear in
        # normal wikitext, and the random suffix defeats marker spoofing.
        self.uniq_prefix = u"\x07UNIQ" + unicode(random.randint(1, 1000000000))
        # marker -> original-content maps, filled by strip()
        self.strip_state = {}
        self.arg_stack = []
        # Alias the shared module-level environment dict.
        self.env = env
        # Remember whether the environment was pre-populated by the caller;
        # if not, __del__ resets the module-level global.
        self.keep_env = (env != {})

    def __del__(self):
        # Reset the shared module-level environment unless the caller
        # supplied a pre-populated one (see __init__.keep_env). Note this
        # rebinds the module global, affecting future parser instances.
        if not self.keep_env:
            global env
            env = {}
- def store_object(self, namespace, key, value=True):
- """
- Used to store objects in the environment
- which assists in preventing recursive imports.
- """
- # Store the item to not reprocess it
- if namespace not in self.env:
- self.env[namespace] = {}
- self.env[namespace][key] = value
- def has_object(self, namespace, key):
- if namespace not in self.env:
- self.env[namespace] = {}
- if hasattr(self, 'count'):
- data = self.env[namespace]
- test = key in data
- ls
- self.count = True
- return key in self.env[namespace]
- def retrieve_object(self, namespace, key, default=None):
- if not self.env.get(namespace):
- self.env[namespace] = {}
- return self.env[namespace].get(key, default)
    def parse(self, text):
        """Run the full wikitext -> HTML pipeline over *text*.

        Accepts str or unicode and returns the same kind it was given
        (str input is re-encoded as UTF-8 on the way out).
        NOTE(review): to_unicode, fixtags and doBlockLevels are defined in
        the truncated portion of this file.
        """
        utf8 = isinstance(text, str)
        text = to_unicode(text)
        # Guarantee a trailing newline for the line-oriented passes below,
        # remembering whether we added it so it can be removed afterwards.
        if text[-1:] != u'\n':
            text = text + u'\n'
            taggedNewline = True
        else:
            taggedNewline = False
        # Pipeline order matters: protected regions (nowiki/html/hooks) are
        # stripped to markers first so later passes cannot touch them, and
        # unstripped again once markup conversion is done.
        text = self.strip(text)
        text = self.removeHtmlTags(text)
        text = self.parseHorizontalRule(text)
        text = self.parseAllQuotes(text)
        text = self.replaceExternalLinks(text)
        text = self.replaceInternalLinks(text)
        text = self.unstrip(text)
        text = self.fixtags(text)
        text = self.doBlockLevels(text, True)
        text = self.unstripNoWiki(text)
        # NOTE(review): this split/join round-trip is a no-op as written.
        text = text.split(u'\n')
        text = u'\n'.join(text)
        # Drop the newline we appended above, if it survived.
        if taggedNewline and text[-1:] == u'\n':
            text = text[:-1]
        if utf8:
            return text.encode("utf-8")
        return text
- def strip(self, text, stripcomments=False, dontstrip=[]):
- render = True
- commentState = {}
- elements = ['nowiki',] + mTagHooks.keys()
- if True: #wgRawHtml
- elements.append('html')
- # Removing $dontstrip tags from $elements list (currently only 'gallery', fixing bug 2700)
- for k in dontstrip:
- if k in elements:
- del elements[k]
- matches = {}
- text = self.extractTagsAndParams(elements, text, matches)
- for marker in matches:
- element, content, params, tag = matches[marker]
- if render:
- tagName = element.lower()
- if tagName == u'!--':
- # comment
- output = tag
- if tag[-3:] != u'-->':
- output += "-->"
- elif tagName == u'html':
- output = content
- elif tagName == u'nowiki':
- output = content.replace(u'&', u'&').replace(u'<', u'<').replace(u'>', u'>')
- else:
- if tagName in mTagHooks:
- output = mTagHooks[tagName](self, content, params)
- else:
- output = content.replace(u'&', u'&').replace(u'<', u'<').replace(u'>', u'>')
- else:
- # Just stripping tags; keep the source
- output = tag
- # Unstrip the output, because unstrip() is no longer recursive so
- # it won't do it itself
- output = self.unstrip(output)
- if not stripcomments and element == u'!--':
- commentState[marker] = output
- elif element == u'html' or element == u'nowiki':
- if 'nowiki' not in self.strip_state:
- self.strip_state['nowiki'] = {}
- self.strip_state['nowiki'][marker] = output
- else:
- if 'general' not in self.strip_state:
- self.strip_state['general'] = {}
- self.strip_state['general'][marker] = output
- # Unstrip comments unless explicitly told otherwise.
- # (The comments are always stripped prior to this point, so as to
- # not invoke any extension tags / parser hooks contained within
- # a comment.)
- if not stripcomments:
- # Put them all back and forget them
- for k in commentState:
- v = commentState[k]
- text = text.replace(k, v)
- return text
- def removeHtmlTags(self, text):
- """convert bad tags into HTML identities"""
- sb = []
- text = self.removeHtmlComments(text)
- bits = text.split(u'<')
- sb.append(bits.pop(0))
- tagstack = []
- tablestack = tagstack
- for x in bits:
- m = _tagPattern.match(x)
- if not m:
- continue
- slash, t, params, brace, rest = m.groups()
- t = t.lower()
- badtag = False
- if t in _htmlelements:
- # Check our stack
- if slash:
- # Closing a tag...
- if t in _htmlsingleonly or len(tagstack) == 0:
- badtag = True
- else:
- ot = tagstack.pop()
- if ot != t:
- if ot in _htmlsingleallowed:
- # Pop all elements with an optional close tag
- # and see if we find a match below them
- optstack = []
- optstack.append(ot)
- while True:
- if len(tagstack) == 0:
- break
- ot = tagstack.pop()
- if ot == t or ot not in _htmlsingleallowed:
- break
- optstack.append(ot)
- if t != ot:
- # No match. Push the optinal elements back again
- badtag = True
- tagstack += reversed(optstack)
- else:
- tagstack.append(ot)
- # <li> can be nested in <ul> or <ol>, skip those cases:
- if ot not in _htmllist and t in _listtags:
- badtag = True
- elif t == u'table':
- if len(tablestack) == 0:
- bagtag = True
- else:
- tagstack = tablestack.pop()
- newparams = u''
- else:
- # Keep track for later
- if t in _tabletags and u'table' not in tagstack:
- badtag = True
- elif t in tagstack and t not in _htmlnest:
- badtag = True
- # Is it a self-closed htmlpair? (bug 5487)
- elif brace == u'/>' and t in _htmlpairs:
- badTag = True
- elif t in _htmlsingleonly:
- # Hack to force empty tag for uncloseable elements
- brace = u'/>'
- elif t in _htmlsingle:
- # Hack to not close $htmlsingle tags
- brace = None
- else:
- if t == u'table':
- tablestack.append(tagstack)
- tagstack = []
- tagstack.append(t)
- newparams = self.fixTagAttributes(params, t)
- if not badtag:
- rest = rest.replace(u'>', u'>')
- if brace == u'/>':
- close = u' /'
- else:
- close = u''
- sb.append(u'<')
- sb.append(slash)
- sb.append(t)
- sb.append(newparams)
- sb.append(close)
- sb.append(u'>')
- sb.append(rest)
- continue
- sb.append(u'<')
- sb.append(x.replace(u'>', u'>'))
- # Close off any remaining tags
- while tagstack:
- t = tagstack.pop()
- sb.append(u'</')
- sb.append(t)
- sb.append(u'>\n')
- if t == u'table':
- if not tablestack:
- break
- tagstack = tablestack.pop()
- return u''.join(sb)
- def removeHtmlComments(self, text):
- """remove <!-- text --> comments from given text"""
- sb = []
- start = text.find(u'<!--')
- last = 0
- while start != -1:
- end = text.find(u'-->', start)
- if end == -1:
- break
- end += 3
-
- spaceStart = max(0, start-1)
- spaceEnd = end
- while text[spaceStart] == u' ' and spaceStart > 0:
- spaceStart -= 1
- while text[spaceEnd] == u' ':
- spaceEnd += 1
-
- if text[spaceStart] == u'\n' and text[spaceEnd] == u'\n':
- sb.append(text[last:spaceStart])
- sb.append(u'\n')
- last = spaceEnd+1
- else:
- sb.append(text[last:spaceStart+1])
- last = spaceEnd
-
- start = text.find(u'<!--', end)
- sb.append(text[last:])
- return u''.join(sb)
    def decodeTagAttributes(self, text):
        """Parse an HTML attribute string into a {name: value} dict.

        Values are whitespace-collapsed, stripped, and have character
        references decoded. Valueless attributes map to ''.
        """
        attribs = {}
        if text.strip() == u'':
            return attribs
        # Undocumented SRE scanner interface: each .search() resumes from
        # the end of the previous match.
        scanner = _attributePat.scanner(text)
        match = scanner.search()
        while match:
            # Groups: name, double-quoted, single-quoted, bare word, #hex.
            key, val1, val2, val3, val4 = match.groups()
            value = val1 or val2 or val3 or val4
            if value:
                value = _space.sub(u' ', value).strip()
            else:
                value = ''
            attribs[key] = self.decodeCharReferences(value)

            match = scanner.search()
        return attribs
- def validateTagAttributes(self, attribs, element):
- """docstring for validateTagAttributes"""
- out = {}
- if element not in _whitelist:
- return out
- whitelist = _whitelist[element]
- for attribute in attribs:
- value = attribs[attribute]
- if attribute not in whitelist:
- continue
- # Strip javascript "expression" from stylesheets.
- # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
- if attribute == u'style':
- value = self.checkCss(value)
- if value == False:
- continue
- elif attribute == u'id':
- value = self.escapeId(value)
- # If this attribute was previously set, override it.
- # Output should only have one attribute of each name.
- out[attribute] = value
- return out
- def safeEncodeAttribute(self, encValue):
- """docstring for safeEncodeAttribute"""
- encValue = encValue.replace(u'&', u'&')
- encValue = encValue.replace(u'<', u'<')
- encValue = encValue.replace(u'>', u'>')
- encValue = encValue.replace(u'"', u'"')
- encValue = encValue.replace(u'{', u'{')
- encValue = encValue.replace(u'[', u'[')
- encValue = encValue.replace(u"''", u'''')
- encValue = encValue.replace(u'ISBN', u'ISBN')
- encValue = encValue.replace(u'RFC', u'RFC')
- encValue = encValue.replace(u'PMID', u'PMID')
- encValue = encValue.replace(u'|', u'|')
- encValue = encValue.replace(u'__', u'__')
- encValue = encValue.replace(u'\n', u' ')
- encValue = encValue.replace(u'\r', u' ')
- encValue = encValue.replace(u'\t', u'	')
- return encValue
- def fixTagAttributes(self, text, element):
- if text.strip() == u'':
- return u''
-
- stripped = self.validateTagAttributes(self.decodeTagAttributes(text), element)
-
- sb = []
-
- for attribute in stripped:
- value = stripped[attribute]
- encAttribute = attribute.replace(u'&', u'&').replace(u'<', u'<').replace(u'>', u'>')
- encValue = self.safeEncodeAttribute(value)
-
- sb.append(u' ')
- sb.append(encAttribute)
- sb.append(u'="')
- sb.append(encValue)
- sb.append(u'"')
-
- return u''.join(sb)
- def validateCodepoint(self, codepoint):
- return codepoint == 0x09 \
- or codepoint == 0x0a \
- or codepoint == 0x0d \
- or (codepoint >= 0x20 and codepoint <= 0xd7ff) \
- or (codepoint >= 0xe000 and codepoint <= 0xfffd) \
- or (codepoint >= 0x10000 and codepoint <= 0x10ffff)
- def _normalizeCallback(self, match):
- text, norm, dec, hexval, _ = match.groups()
- if norm:
- sb = []
- sb.append(u'&')
- if norm not in _htmlEntities:
- sb.append(u'amp;')
- sb.append(norm)
- sb.append(u';')
- return u''.join(sb)
- elif dec:
- dec = int(dec)
- if self.validateCodepoint(dec):
- sb = []
- sb.append(u'&#')
- sb.append(dec)
- sb.append(u';')
- return u''.join(sb)
- elif hexval:
- hexval = int(hexval, 16)
- if self.validateCodepoint(hexval):
- sb = []
- sb.append(u'&#x')
- sb.append(hex(hexval))
- sb.append(u';')
- return u''.join(sb)
- return text.replace(u'&', u'&').replace(u'<', u'<').replace(u'>', u'>')
    def normalizeCharReferences(self, text):
        """Normalize all character references in *text* via
        _normalizeCallback (valid ones kept, invalid ones escaped)."""
        return _charRefsPat.sub(self._normalizeCallback, text)
- def _decodeCallback(self, match):
- text, norm, dec, hexval, _ = match.groups()
- if norm:
- if norm in _htmlEntities:
- return unichr(_htmlEntities[norm])
- else:
- sb = []
- sb.append(u'&')
- sb.append(norm)
- sb.append(u';')
- return u''.join(sb)
- elif dec:
- dec = int(dec)
- if self.validateCodepoint(dec):
- return unichr(dec)
- return u'?'
- elif hexval:
- hexval = int(hexval, 16)
- if self.validateCodepoint(dec):
- return unichr(dec)
- return u'?'
- return text
    def decodeCharReferences(self, text):
        """Decode &entity;, &#nnn; and &#xhh; references in *text* to the
        characters they denote; returns '' for falsy input."""
        if text:
            return _charRefsPat.sub(self._decodeCallback, text)
        return ''
    def _convertToUtf8(self, s):
        # re.sub callback: turn a CSS \hhhh escape (captured hex digits)
        # into the character it encodes.
        return unichr(int(s.group(1), 16))
- def checkCss(self, value):
- """docstring for checkCss"""
- stripped = self.decodeCharReferences(value)
-
- stripped = _cssCommentPat.sub(u'', stripped)
- value = stripped
-
- stripped = _toUTFPat.sub(self._convertToUtf8, stripped)
- stripped.replace(u'\\', u'')
- if _hackPat.search(stripped):
- # someone is haxx0ring
- return False
-
- return value
    def escapeId(self, value):
        """Normalise an id attribute value into an anchor-safe form.

        NOTE(review): delegates to safe_name, which is defined in the
        truncated portion of this file — confirm its contract there.
        """
        # TODO
        return safe_name(value)
    def parseHorizontalRule(self, text):
        """Replace lines starting with four or more dashes with <hr />."""
        return _hrPat.sub(ur'<hr />', text)
    def parseHeaders(self, text):
        """Convert = heading = markup (levels 1-6) to <h1>..<h6>.

        Deepest level runs first so '== x ==' is consumed as h2 before
        the h1 pattern could claim its outer '=' pair.
        """
        text = _h6Pat.sub(ur'<h6>\1</h6>', text)
        text = _h5Pat.sub(ur'<h5>\1</h5>', text)
        text = _h4Pat.sub(ur'<h4>\1</h4>', text)
        text = _h3Pat.sub(ur'<h3>\1</h3>', text)
        text = _h2Pat.sub(ur'<h2>\1</h2>', text)
        text = _h1Pat.sub(ur'<h1>\1</h1>', text)
        return text
    def parseQuotes(self, text):
        """Convert apostrophe markup in one line of text to HTML: '' is
        italics, ''' is bold, ''''' is both (port of MediaWiki doQuotes).
        """
        arr = _quotePat.split(text)
        if len(arr) == 1:
            # No apostrophe runs at all: nothing to do.
            return text
        # First, do some preliminary work. This may shift some apostrophes from
        # being mark-up to being text. It also counts the number of occurrences
        # of bold and italics mark-ups.
        numBold = 0
        numItalics = 0
        for i,r in zip(range(len(arr)), arr):
            # Odd indices hold the apostrophe runs produced by the split.
            if i%2 == 1:
                l = len(r)
                if l == 4:
                    # '''' -> one literal apostrophe followed by bold.
                    arr[i-1] += u"'"
                    arr[i] = u"'''"
                elif l > 5:
                    # Longer than 5: the extras are literal apostrophes.
                    arr[i-1] += u"'" * (len(arr[i]) - 5)
                    arr[i] = u"'''''"
                if l == 2:
                    numItalics += 1
                elif l >= 5:
                    numItalics += 1
                    numBold += 1
                else:
                    numBold += 1

        # If there is an odd number of both bold and italics, it is likely
        # that one of the bold ones was meant to be an apostrophe followed
        # by italics. Which one we cannot know for certain, but it is more
        # likely to be one that has a single-letter word before it.
        if numBold%2 == 1 and numItalics%2 == 1:
            firstSingleLetterWord = -1
            firstMultiLetterWord = -1
            firstSpace = -1
            for i,r in zip(range(len(arr)), arr):
                if i%2 == 1 and len(r) == 3:
                    # Look at the last two characters before the ''' run.
                    x1 = arr[i-1][-1:]
                    x2 = arr[i-1][-2:-1]
                    if x1 == u' ':
                        if firstSpace == -1:
                            firstSpace = i
                    elif x2 == u' ':
                        if firstSingleLetterWord == -1:
                            firstSingleLetterWord = i
                    else:
                        if firstMultiLetterWord == -1:
                            firstMultiLetterWord = i

            # If there is a single-letter word, use it!
            if firstSingleLetterWord > -1:
                arr[firstSingleLetterWord] = u"''"
                arr[firstSingleLetterWord-1] += u"'"
            # If not, but there's a multi-letter word, use that one.
            elif firstMultiLetterWord > -1:
                arr[firstMultiLetterWord] = u"''"
                arr[firstMultiLetterWord-1] += u"'"
            # ... otherwise use the first one that has neither.
            # (notice that it is possible for all three to be -1 if, for example,
            # there is only one pentuple-apostrophe in the line)
            elif firstSpace > -1:
                arr[firstSpace] = u"''"
                arr[firstSpace-1] += u"'"

        # Now let's actually convert our apostrophic mush to HTML!
        # state is one of '' / 'i' / 'b' / 'ib' / 'bi' / 'both'; 'both'
        # buffers text until the required tag nesting order is known.
        output = []
        buffer = None
        state = ''
        for i,r in zip(range(len(arr)), arr):
            if i%2 == 0:
                # Even indices: literal text between apostrophe runs.
                if state == 'both':
                    buffer.append(r)
                else:
                    output.append(r)
            else:
                if len(r) == 2:
                    # '' toggles italics relative to the current state.
                    if state == 'i':
                        output.append(u"</em>")
                        state = ''
                    elif state == 'bi':
                        output.append(u"</em>")
                        state = 'b'
                    elif state == 'ib':
                        # Close and reopen bold to keep tags well nested.
                        output.append(u"</strong></em><strong>")
                        state = 'b'
                    elif state == 'both':
                        output.append(u"<strong><em>")
                        output.append(u''.join(buffer))
                        buffer = None
                        output.append(u"</em>")
                        state = 'b'
                    elif state == 'b':
                        output.append(u"<em>")
                        state = 'bi'
                    else: # ''
                        output.append(u"<em>")
                        state = 'i'
                elif len(r) == 3:
                    # ''' toggles bold relative to the current state.
                    if state == 'b':
                        output.append(u"</strong>")
                        state = ''
                    elif state == 'bi':
                        output.append(u"</em></strong><em>")
                        state = 'i'
                    elif state == 'ib':
                        output.append(u"</strong>")
                        state = 'i'
                    elif state == 'both':
                        output.append(u"<em><strong>")
                        output.append(u''.join(buffer))
                        buffer = None
                        output.append(u"</strong>")
                        state = 'i'
                    elif state == 'i':
                        output.append(u"<strong>")
                        state = 'ib'
                    else: # ''
                        output.append(u"<strong>")
                        state = 'b'
                elif len(r) == 5:
                    # ''''' toggles both bold and italics at once.
                    if state == 'b':
                        output.append(u"</strong><em>")
                        state = 'i'
                    elif state == 'i':
                        output.append(u"</em><strong>")
                        state = 'b'
                    elif state == 'bi':
                        output.append(u"</em></strong>")
                        state = ''
                    elif state == 'ib':
                        output.append(u"</strong></em>")
                        state = ''
                    elif state == 'both':
                        output.append(u"<em><strong>")
                        output.append(u''.join(buffer))
                        buffer = None
                        output.append(u"</strong></em>")
                        state = ''
                    else: # ''
                        # Start buffering until we know the nesting order.
                        buffer = []
                        state = 'both'

        # Flush any buffered 'both' text, then close whatever is open.
        if state == 'both':
            output.append(u"<em><strong>")
            output.append(u''.join(buffer))
            buffer = None
            output.append(u"</strong></em>")
        elif state != '':
            if state == 'b' or state == 'ib':
                output.append(u"</strong>")
            if state == 'i' or state == 'bi' or state == 'ib':
                output.append(u"</em>")
            if state == 'bi':
                output.append(u"</strong>")
        return u''.join(output)
- def parseAllQuotes(self, text):
- sb = []
- lines = text.split(u'\n')
- first = True
- for line in lines:
- if not first:
- sb.append(u'\n')
- else:
- first = False
- sb.append(self.parseQuotes(line))
- return u''.join(sb)
    def replaceExternalLinks(self, text):
        """Convert bracketed [url] / [url label] links into <a> elements.

        _bracketedLinkPat.split() yields chunks in groups of three:
        plain text, the URL, and the (possibly empty) label.
        NOTE(review): to_unicode and truncate_url are defined in the
        truncated portion of this file.
        """
        sb = []
        bits = _bracketedLinkPat.split(text)
        l = len(bits)
        i = 0
        num_links = 0
        while i < l:
            if i%3 == 0:
                # Plain text between links.
                #sb.append(self.replaceFreeExternalLinks(bits[i]))
                sb.append(bits[i])
                i += 1
            else:
                sb.append(u'<a href="')
                sb.append(bits[i])
                sb.append(u'">')
                if not bits[i+1]:
                    # No label given: display a truncated form of the URL.
                    num_links += 1
                    sb.append(to_unicode(truncate_url(bits[i])))
                else:
                    sb.append(bits[i+1])
                sb.append(u'</a>')
                i += 2
        return ''.join(sb)
    def replaceInternalLinks(self, text):
        """Dispatch [[...]] links to the registered internal-link hooks.

        _internalLinkPat.split() yields chunks in groups of three: plain
        text, the (optional) namespace, and the link body. Unhandled links
        are emitted back as their original [[...]] source.
        """
        sb = []
        # [[x]] -> (None, 'x')
        # [[type:x]] -> ('type','x')
        # [[:type:x]] -> (':type','x')
        bits = _internalLinkPat.split(text)
        l = len(bits)
        i = 0
        num_links = 0
        while i < l:
            if i%3 == 0:
                sb.append(bits[i])
                i += 1
            else:
                space, name = bits[i:i+2]
                if space in mInternalLinkHooks:
                    # Exact namespace hook.
                    sb.append(mInternalLinkHooks[space](self, space, name))
                elif space and space.startswith(':') and ':' in mInternalLinkHooks:
                    # Generic [[:any:link]] hook.
                    sb.append(mInternalLinkHooks[':'](self, space, name))
                elif '*' in mInternalLinkHooks:
                    # Catch-all hook.
                    sb.append(mInternalLinkHooks['*'](self, space, name))
                elif bits[i]:
                    # No hook registered: keep the original source text.
                    sb.append(u'[[%s:%s]]' % (bits[i], bits[i+1]))
                else:
                    sb.append(u'[[%s]]' % bits[i+1])
                i += 2
        return ''.join(sb)
- # TODO: fix this so it actually works
- def replaceFreeExternalLinks(self, text):
- bits = _protocolPat.split(text)
- sb = [bits.pop(0)]
- i = 0
- l = len(bits)
- while i < l:
- protocol = bits[i]
- remainder = bits[i+1]
- i += 2
- match = _specialUrlPat.match(remainder)
- if match:
- # Found some characters after the protocol that look promising
- url = protocol + match.group(1)
- trail = match.group(2)
-
- # special case: handle urls as url args:
- # http://www.example.com/foo?=http://www.example.com/bar
- if len(trail) == 0 and len(bits) > i and _protocolsPat.match(bits[i]):
- match = _specialUrlPat.match(remainder)
- if match:
- url += bits[i] + match.group(1)
- i += 2
- trail = match.group(2)
-
- # The characters '<' and '>' (which were escaped by
- # removeHTMLtags()) should not be included in
- # URLs, per RFC 2396.
- pos = max(url.find('<'), url.find('>'))
- if pos != -1:
- trail = url[pos:] + trail
- url = url[0:pos]
-
- sep = ',;.:!?'
- if '(' not in url:
- sep += ')'
-
- i = len(url)-1
- while i >= 0:
- char = url[i]
- if char not in sep:
- break
- i -= 1
- i += 1
-
- if i != len(url):
- trail = url[i:] + trail
- url = url[0:i]
-
- url = cleanURL(url)
-
- sb.append(u'<a href="')
- sb.append(url)
- sb.append(u'">')
- sb.append(truncate_url(url))
- sb.append(u'</a>')
- #sb.append(text)
- sb.append(trail)
- else:
- sb.append(protocol)
- sb.append(remainder)
- return ''.join(sb)
- def urlencode(self, char):
- num = ord(char)
- if num == 32:
- return '+'
- return "%%%02x" % num
def cleanURL(self, url):
    """Normalize and escape a URL extracted from wiki text.

    Decodes HTML entities (they are re-escaped later by
    makeExternalLink()), percent-encodes any control characters that
    decoding may have introduced, and strips IDN-ignorable characters
    from the hostname portion.  Returns the cleaned URL; if the URL
    does not match the expected protocol/host/rest shape it is
    returned with only entity decoding and control-char escaping done.
    """
    # Normalize any HTML entities in input. They will be
    # re-escaped by makeExternalLink().
    url = self.decodeCharReferences(url)

    # Escape any control characters introduced by the above step
    url = _controlCharsPat.sub(self.urlencode, url)

    # Validate hostname portion
    match = _hostnamePat.match(url)
    if match:
        protocol, host, rest = match.groups()

        # Characters that will be ignored in IDNs.
        # http://tools.ietf.org/html/3454#section-3.1
        # Strip them before further processing so blacklists and such work.
        # BUG FIX: the substitution result was previously discarded
        # (the line read `_stripPat.sub('', host)` with no assignment),
        # so the ignorable characters were never actually removed.
        host = _stripPat.sub('', host)

        # @fixme: validate hostnames here

        return protocol + host + rest
    else:
        return url
def unstripForHTML(self, text):
    """Reverse both stages of stripping on *text*: first the 'general'
    markers, then the 'nowiki' markers."""
    return self.unstripNoWiki(self.unstrip(text))
def unstrip(self, text):
    """Replace every 'general' strip marker in *text* with the original
    content it stands for (recorded in self.strip_state)."""
    replacements = self.strip_state.get('general')
    if replacements is None:
        return text
    for marker in replacements:
        text = text.replace(marker, replacements[marker])
    return text
def unstripNoWiki(self, text):
    """Replace every 'nowiki' strip marker in *text* with the original
    content it stands for (recorded in self.strip_state)."""
    replacements = self.strip_state.get('nowiki')
    if replacements is None:
        return text
    for marker in replacements:
        text = text.replace(marker, replacements[marker])
    return text
def extractTagsAndParams(self, elements, text, matches):
    """
    Replaces all occurrences of HTML-style comments and the given tags
    in the text with a random marker and returns the stripped text. The
    output parameter `matches` will be an associative array filled with
    data in the form:
        'UNIQ-xxxxx' => (
            'element',
            'tag content',
            { 'param': 'x' },
            '<element param="x">tag content</element>' )
    For an empty-element tag (`<tag/>`) the content slot is None.
    """
    global _extractTagsAndParams_n
    stripped = u''

    # Start-tag regexes are cached per tag list, keyed by the joined list.
    taglist = u'|'.join(elements)
    if taglist not in _startRegexHash:
        _startRegexHash[taglist] = re.compile(u"<(" + taglist + u")(\\s+[^>]*?|\\s*?)(/?>)|<(!--)", re.UNICODE | re.IGNORECASE)
    start = _startRegexHash[taglist]

    while text != u'':
        p = start.split(text, 1)
        stripped += p[0]
        if len(p) == 1:
            # no further start tag or comment: done
            break
        elif p[4]:
            # HTML comment: group 4 matched '!--'
            element = p[4]
            attributes = u''
            close = u''
        else:
            element = p[1]
            attributes = p[2]
            close = p[3]
        inside = p[5]

        marker = self.uniq_prefix + u'-' + element + u'-' + (u"%08X" % _extractTagsAndParams_n) + u'-QINU'
        _extractTagsAndParams_n += 1
        stripped += marker

        if close == u'/>':
            # empty element tag, <tag />
            content = None
            text = inside
            tail = None
        else:
            if element == u'!--':
                end = _endCommentPat
            else:
                # End-tag regexes are cached per element name.
                if element not in _endRegexHash:
                    _endRegexHash[element] = re.compile(u'(</' + element + u'\\s*>)', re.UNICODE | re.IGNORECASE)
                end = _endRegexHash[element]
            q = end.split(inside, 1)
            content = q[0]
            if len(q) < 3:
                # no end tag: swallow the remainder of the text
                tail = ''
                text = ''
            else:
                tail = q[1]
                text = q[2]

        # BUG FIX: content and tail are None for empty-element tags;
        # coerce them to u'' when rebuilding the raw tag text so input
        # such as '<nowiki/>' no longer raises a TypeError here.
        matches[marker] = (
            element,
            content,
            self.decodeTagAttributes(attributes),
            u"<" + element + attributes + close + (content or u'') + (tail or u'')
        )
    return stripped
def fixtags(self, text):
    """Clean up special characters, only run once, next-to-last before doBlockLevels"""
    # french spaces, last one Guillemet-left
    # only if there is something before the space
    # NOTE(review): the replacement literal inserts a space character
    # between the captured groups; it looks like it should be a
    # non-breaking space per French typography -- confirm the exact
    # character against the upstream MediaWiki Sanitizer.
    text = _guillemetLeftPat.sub(ur'\1 \2', text)
    # french spaces, Guillemet-right
    text = _guillemetRightPat.sub(ur'\1 ', text)
    return text
def closeParagraph(self, mLastSection):
    """Used by doBlockLevels(): return the closing tag for the currently
    open section, or the empty string when no section is open."""
    if mLastSection == u'':
        return u''
    return u'</' + mLastSection + u'>\n'
def getCommon(self, st1, st2):
    """
    Return the length of the longest common prefix of *st1* and *st2*
    (i.e. the longest common substring starting at the beginning of
    both arguments).
    """
    limit = min(len(st1), len(st2))
    for idx in range(limit):
        if st1[idx] != st2[idx]:
            return idx
    return limit
def openList(self, char, mLastSection):
    """
    Open the list element appropriate to the list-prefix character.

    Returns (html, mDTopen): the markup that closes any open paragraph
    and opens the list plus its first item, and whether a <dt>
    (definition term) is now the open item.
    """
    openers = {
        u'*': u'<ul><li>',
        u'#': u'<ol><li>',
        u':': u'<dl><dd>',
        u';': u'<dl><dt>',
    }
    html = self.closeParagraph(mLastSection)
    html += openers.get(char, u'<!-- ERR 1 -->')
    # Only a ';' prefix opens a definition term.
    return html, char == u';'
def nextItem(self, char, mDTopen):
    """
    Close the current list item and open the next one for the given
    list-prefix character.

    Returns (html, mDTopen): the transition markup, and the new
    definition-term-open state (None when it is unchanged/irrelevant).
    """
    if char == u'*' or char == '#':
        return u'</li><li>', None
    elif char == u':' or char == u';':
        # Close either the open <dt> or <dd>, depending on state.
        close = u'</dd>'
        if mDTopen:
            close = '</dt>'
        if char == u';':
            return close + u'<dt>', True
        else:
            return close + u'<dd>', False
    # BUG FIX: this path previously returned a bare string while every
    # other path returns a (markup, state) pair, which would raise a
    # ValueError in callers that unpack two values.
    return u'<!-- ERR 2 -->', None
def closeList(self, char, mDTopen):
    """Return the markup that closes the list opened for the given
    list-prefix character; mDTopen selects </dt> vs </dd> for ':'."""
    if char == u'*':
        return u'</li></ul>\n'
    if char == u'#':
        return u'</li></ol>\n'
    if char == u':':
        return u'</dt></dl>\n' if mDTopen else u'</dd></dl>\n'
    return u'<!-- ERR 3 -->'
- def findColonNoLinks(self, text, before, after):
- try:
- pos = text.search(':')
- except:
- return False
- lt = text.find('<')
- if lt == -1 or lt > pos:
- # Easy; no tag nesting to worry about
- before = text[0:pos]
- after = text[0:pos+1]
- return before, after, pos
- # Ugly state machine to walk through avoiding tags.
- state = MW_COLON_STATE_TEXT;
- stack = 0;
- i = 0
- while i < len(text):
- c = text[i];
- if state == 0: # MW_COLON_STATE_TEXT:
- if text[i] == '<':
- # Could be either a <start> tag or an </end> tag
- state = MW_COLON_STATE_TAGSTART
- elif text[i] == ':':
- …
Large files files are truncated, but you can click here to view the full file