
/compat/wikimarkup.py

https://bitbucket.org/piranha/byteflow/
Python | 2143 lines | 2118 code | 5 blank | 20 comment | 1 complexity | d6cfd41875897e3e83ab13e9e620793b MD5


  1. # -*- encoding: utf-8 -*-
  2. """
  3. MediaWiki-style markup
  4. Copyright (C) 2008 David Cramer <dcramer@gmail.com>
  5. This program is free software: you can redistribute it and/or modify
  6. it under the terms of the GNU General Public License as published by
  7. the Free Software Foundation, either version 3 of the License, or
  8. (at your option) any later version.
  9. This program is distributed in the hope that it will be useful,
  10. but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. GNU General Public License for more details.
  13. You should have received a copy of the GNU General Public License
  14. along with this program. If not, see <http://www.gnu.org/licenses/>.
  15. """
  16. import re, random, locale
  17. from base64 import b64encode, b64decode
  18. # a few patterns we use later
  19. MW_COLON_STATE_TEXT = 0
  20. MW_COLON_STATE_TAG = 1
  21. MW_COLON_STATE_TAGSTART = 2
  22. MW_COLON_STATE_CLOSETAG = 3
  23. MW_COLON_STATE_TAGSLASH = 4
  24. MW_COLON_STATE_COMMENT = 5
  25. MW_COLON_STATE_COMMENTDASH = 6
  26. MW_COLON_STATE_COMMENTDASHDASH = 7
  27. _attributePat = re.compile(ur'''(?:^|\s)([A-Za-z0-9]+)(?:\s*=\s*(?:"([^<"]*)"|'([^<']*)'|([a-zA-Z0-9!#$%&()*,\-./:;<>?@[\]^_`{|}~]+)|#([0-9a-fA-F]+)))''', re.UNICODE)
  28. _space = re.compile(ur'\s+', re.UNICODE)
  29. _closePrePat = re.compile(u"</pre", re.UNICODE | re.IGNORECASE)
  30. _openPrePat = re.compile(u"<pre", re.UNICODE | re.IGNORECASE)
  31. _openMatchPat = re.compile(u"(<table|<blockquote|<h1|<h2|<h3|<h4|<h5|<h6|<pre|<tr|<p|<ul|<ol|<li|</center|</tr|</td|</th)", re.UNICODE | re.IGNORECASE)
  32. _tagPattern = re.compile(ur'^(/?)(\w+)([^>]*?)(/?>)([^<]*)$', re.UNICODE)
  33. _htmlpairs = ( # Tags that must be closed
  34. u'b', u'del', u'i', u'ins', u'u', u'font', u'big', u'small', u'sub', u'sup', u'h1',
  35. u'h2', u'h3', u'h4', u'h5', u'h6', u'cite', u'code', u'em', u's',
  36. u'strike', u'strong', u'tt', u'var', u'div', u'center',
  37. u'blockquote', u'ol', u'ul', u'dl', u'table', u'caption', u'pre',
  38. u'ruby', u'rt', u'rb', u'rp', u'p', u'span',
  39. )
  40. _htmlsingle = (
  41. u'br', u'hr', u'li', u'dt', u'dd', u'img',
  42. )
  43. _htmlsingleonly = ( # Elements that cannot have close tags
  44. u'br', u'hr', u'img',
  45. )
  46. _htmlnest = ( # Tags that can be nested--??
  47. u'table', u'tr', u'td', u'th', u'div', u'blockquote', u'ol', u'ul',
  48. u'dl', u'font', u'big', u'small', u'sub', u'sup', u'span', u'img',
  49. )
  50. _tabletags = ( # Can only appear inside table
  51. u'td', u'th', u'tr',
  52. )
  53. _htmllist = ( # Tags used by list
  54. u'ul', u'ol',
  55. )
  56. _listtags = ( # Tags that can appear in a list
  57. u'li',
  58. )
  59. _htmlsingleallowed = _htmlsingle + _tabletags
  60. _htmlelements = _htmlsingle + _htmlpairs + _htmlnest
  61. _htmlEntities = {
  62. u'Aacute': 193, u'aacute': 225, u'Acirc': 194, u'acirc': 226, u'acute': 180,
  63. u'AElig': 198, u'aelig': 230, u'Agrave': 192, u'agrave': 224, u'alefsym': 8501,
  64. u'Alpha': 913, u'alpha': 945, u'amp': 38, u'and': 8743, u'ang': 8736, u'Aring': 197,
  65. u'aring': 229,
  66. u'asymp': 8776,
  67. u'Atilde': 195,
  68. u'atilde': 227,
  69. u'Auml': 196,
  70. u'auml': 228,
  71. u'bdquo': 8222,
  72. u'Beta': 914,
  73. u'beta': 946,
  74. u'brvbar': 166,
  75. u'bull': 8226,
  76. u'cap': 8745,
  77. u'Ccedil': 199,
  78. u'ccedil': 231,
  79. u'cedil': 184,
  80. u'cent': 162,
  81. u'Chi': 935,
  82. u'chi': 967,
  83. u'circ': 710,
  84. u'clubs': 9827,
  85. u'cong': 8773,
  86. u'copy': 169,
  87. u'crarr': 8629,
  88. u'cup': 8746,
  89. u'curren': 164,
  90. u'dagger': 8224,
  91. u'Dagger': 8225,
  92. u'darr': 8595,
  93. u'dArr': 8659,
  94. u'deg': 176,
  95. u'Delta': 916,
  96. u'delta': 948,
  97. u'diams': 9830,
  98. u'divide': 247,
  99. u'Eacute': 201,
  100. u'eacute': 233,
  101. u'Ecirc': 202,
  102. u'ecirc': 234,
  103. u'Egrave': 200,
  104. u'egrave': 232,
  105. u'empty': 8709,
  106. u'emsp': 8195,
  107. u'ensp': 8194,
  108. u'Epsilon': 917,
  109. u'epsilon': 949,
  110. u'equiv': 8801,
  111. u'Eta': 919,
  112. u'eta': 951,
  113. u'ETH': 208,
  114. u'eth': 240,
  115. u'Euml': 203,
  116. u'euml': 235,
  117. u'euro': 8364,
  118. u'exist': 8707,
  119. u'fnof': 402,
  120. u'forall': 8704,
  121. u'frac12': 189,
  122. u'frac14': 188,
  123. u'frac34': 190,
  124. u'frasl': 8260,
  125. u'Gamma': 915,
  126. u'gamma': 947,
  127. u'ge': 8805,
  128. u'gt': 62,
  129. u'harr': 8596,
  130. u'hArr': 8660,
  131. u'hearts': 9829,
  132. u'hellip': 8230,
  133. u'Iacute': 205,
  134. u'iacute': 237,
  135. u'Icirc': 206,
  136. u'icirc': 238,
  137. u'iexcl': 161,
  138. u'Igrave': 204,
  139. u'igrave': 236,
  140. u'image': 8465,
  141. u'infin': 8734,
  142. u'int': 8747,
  143. u'Iota': 921,
  144. u'iota': 953,
  145. u'iquest': 191,
  146. u'isin': 8712,
  147. u'Iuml': 207,
  148. u'iuml': 239,
  149. u'Kappa': 922,
  150. u'kappa': 954,
  151. u'Lambda': 923,
  152. u'lambda': 955,
  153. u'lang': 9001,
  154. u'laquo': 171,
  155. u'larr': 8592,
  156. u'lArr': 8656,
  157. u'lceil': 8968,
  158. u'ldquo': 8220,
  159. u'le': 8804,
  160. u'lfloor': 8970,
  161. u'lowast': 8727,
  162. u'loz': 9674,
  163. u'lrm': 8206,
  164. u'lsaquo': 8249,
  165. u'lsquo': 8216,
  166. u'lt': 60,
  167. u'macr': 175,
  168. u'mdash': 8212,
  169. u'micro': 181,
  170. u'middot': 183,
  171. u'minus': 8722,
  172. u'Mu': 924,
  173. u'mu': 956,
  174. u'nabla': 8711,
  175. u'nbsp': 160,
  176. u'ndash': 8211,
  177. u'ne': 8800,
  178. u'ni': 8715,
  179. u'not': 172,
  180. u'notin': 8713,
  181. u'nsub': 8836,
  182. u'Ntilde': 209,
  183. u'ntilde': 241,
  184. u'Nu': 925,
  185. u'nu': 957,
  186. u'Oacute': 211,
  187. u'oacute': 243,
  188. u'Ocirc': 212,
  189. u'ocirc': 244,
  190. u'OElig': 338,
  191. u'oelig': 339,
  192. u'Ograve': 210,
  193. u'ograve': 242,
  194. u'oline': 8254,
  195. u'Omega': 937,
  196. u'omega': 969,
  197. u'Omicron': 927,
  198. u'omicron': 959,
  199. u'oplus': 8853,
  200. u'or': 8744,
  201. u'ordf': 170,
  202. u'ordm': 186,
  203. u'Oslash': 216,
  204. u'oslash': 248,
  205. u'Otilde': 213,
  206. u'otilde': 245,
  207. u'otimes': 8855,
  208. u'Ouml': 214,
  209. u'ouml': 246,
  210. u'para': 182,
  211. u'part': 8706,
  212. u'permil': 8240,
  213. u'perp': 8869,
  214. u'Phi': 934,
  215. u'phi': 966,
  216. u'Pi': 928,
  217. u'pi': 960,
  218. u'piv': 982,
  219. u'plusmn': 177,
  220. u'pound': 163,
  221. u'prime': 8242,
  222. u'Prime': 8243,
  223. u'prod': 8719,
  224. u'prop': 8733,
  225. u'Psi': 936,
  226. u'psi': 968,
  227. u'quot': 34,
  228. u'radic': 8730,
  229. u'rang': 9002,
  230. u'raquo': 187,
  231. u'rarr': 8594,
  232. u'rArr': 8658,
  233. u'rceil': 8969,
  234. u'rdquo': 8221,
  235. u'real': 8476,
  236. u'reg': 174,
  237. u'rfloor': 8971,
  238. u'Rho': 929,
  239. u'rho': 961,
  240. u'rlm': 8207,
  241. u'rsaquo': 8250,
  242. u'rsquo': 8217,
  243. u'sbquo': 8218,
  244. u'Scaron': 352,
  245. u'scaron': 353,
  246. u'sdot': 8901,
  247. u'sect': 167,
  248. u'shy': 173,
  249. u'Sigma': 931,
  250. u'sigma': 963,
  251. u'sigmaf': 962,
  252. u'sim': 8764,
  253. u'spades': 9824,
  254. u'sub': 8834,
  255. u'sube': 8838,
  256. u'sum': 8721,
  257. u'sup': 8835,
  258. u'sup1': 185,
  259. u'sup2': 178,
  260. u'sup3': 179,
  261. u'supe': 8839,
  262. u'szlig': 223,
  263. u'Tau': 932,
  264. u'tau': 964,
  265. u'there4': 8756,
  266. u'Theta': 920,
  267. u'theta': 952,
  268. u'thetasym': 977,
  269. u'thinsp': 8201,
  270. u'THORN': 222,
  271. u'thorn': 254,
  272. u'tilde': 732,
  273. u'times': 215,
  274. u'trade': 8482,
  275. u'Uacute': 218,
  276. u'uacute': 250,
  277. u'uarr': 8593,
  278. u'uArr': 8657,
  279. u'Ucirc': 219,
  280. u'ucirc': 251,
  281. u'Ugrave': 217,
  282. u'ugrave': 249,
  283. u'uml': 168,
  284. u'upsih': 978,
  285. u'Upsilon': 933,
  286. u'upsilon': 965,
  287. u'Uuml': 220,
  288. u'uuml': 252,
  289. u'weierp': 8472,
  290. u'Xi': 926,
  291. u'xi': 958,
  292. u'Yacute': 221,
  293. u'yacute': 253,
  294. u'yen': 165,
  295. u'Yuml': 376,
  296. u'yuml': 255,
  297. u'Zeta': 918,
  298. u'zeta': 950,
  299. u'zwj': 8205,
  300. u'zwnj': 8204
  301. }
  302. _charRefsPat = re.compile(ur'''(&([A-Za-z0-9]+);|&#([0-9]+);|&#[xX]([0-9A-Za-z]+);|(&))''', re.UNICODE)
  303. _cssCommentPat = re.compile(ur'''\*.*?\*''', re.UNICODE)
  304. _toUTFPat = re.compile(ur'''\\([0-9A-Fa-f]{1,6})[\s]?''', re.UNICODE)
  305. _hackPat = re.compile(ur'''(expression|tps*://|url\s*\().*''', re.UNICODE | re.IGNORECASE)
  306. _hrPat = re.compile(u'''^-----*''', re.UNICODE | re.MULTILINE)
  307. _h1Pat = re.compile(ur'^=(.+)=\s*$', re.UNICODE | re.MULTILINE)
  308. _h2Pat = re.compile(ur'^==(.+)==\s*$', re.UNICODE | re.MULTILINE)
  309. _h3Pat = re.compile(ur'^===(.+)===\s*$', re.UNICODE | re.MULTILINE)
  310. _h4Pat = re.compile(ur'^====(.+)====\s*$', re.UNICODE | re.MULTILINE)
  311. _h5Pat = re.compile(ur'^=====(.+)=====\s*$', re.UNICODE | re.MULTILINE)
  312. _h6Pat = re.compile(ur'^======(.+)======\s*$', re.UNICODE | re.MULTILINE)
  313. _quotePat = re.compile(u"""(''+)""", re.UNICODE)
  314. _removePat = re.compile(ur'\b(' + ur'|'.join((u"a", u"an", u"as", u"at", u"before", u"but", u"by", u"for", u"from",
  315. u"is", u"in", u"into", u"like", u"of", u"off", u"on", u"onto", u"per",
  316. u"since", u"than", u"the", u"this", u"that", u"to", u"up", u"via",
  317. u"with")) + ur')\b', re.UNICODE | re.IGNORECASE)
  318. _nonWordSpaceDashPat = re.compile(ur'[^\w\s\-\./]', re.UNICODE)
  319. _multiSpacePat = re.compile(ur'[\s\-_\./]+', re.UNICODE)
  320. _spacePat = re.compile(ur' ', re.UNICODE)
  321. _linkPat = re.compile(ur'^(?:([A-Za-z0-9]+):)?([^\|]+)(?:\|([^\n]+?))?\]\](.*)$', re.UNICODE | re.DOTALL)
  322. _bracketedLinkPat = re.compile(ur'(?:\[((?:mailto:|irc://|https?://|ftp://|/)[^<>\]\[' + u"\x00-\x20\x7f" + ur']*)\s*(.*?)\])', re.UNICODE)
  323. _protocolPat = re.compile(ur'(\b(?:mailto:|irc://|https?://|ftp://))', re.UNICODE)
  324. _specialUrlPat = re.compile(ur'^([^<>\]\[' + u"\x00-\x20\x7f" + ur']+)(.*)$', re.UNICODE)
  325. _protocolsPat = re.compile(ur'^(mailto:|irc://|https?://|ftp://)$', re.UNICODE)
  326. _controlCharsPat = re.compile(ur'[\]\[<>"' + u"\\x00-\\x20\\x7F" + ur']', re.UNICODE)
  327. _hostnamePat = re.compile(ur'^([^:]+:)(//[^/]+)?(.*)$', re.UNICODE)
  328. _stripPat = re.compile(u'\\s|\u00ad|\u1806|\u200b|\u2060|\ufeff|\u03f4|\u034f|\u180b|\u180c|\u180d|\u200c|\u200d|[\ufe00-\ufe0f]', re.UNICODE)
  329. _zomgPat = re.compile(ur'^(:*)\{\|(.*)$', re.UNICODE)
  330. _headerPat = re.compile(ur"<[Hh]([1-6])(.*?)>(.*?)</[Hh][1-6] *>", re.UNICODE)
  331. _templateSectionPat = re.compile(ur"<!--MWTEMPLATESECTION=([^&]+)&([^_]+)-->", re.UNICODE)
  332. _tagPat = re.compile(ur"<.*?>", re.UNICODE)
  333. _startRegexHash = {}
  334. _endRegexHash = {}
  335. _endCommentPat = re.compile(ur'(-->)', re.UNICODE)
  336. _extractTagsAndParams_n = 1
  337. _guillemetLeftPat = re.compile(ur'(.) (\?|:|;|!|\u00bb)', re.UNICODE)
  338. _guillemetRightPat = re.compile(ur'(\u00ab) ', re.UNICODE)
  339. def setupAttributeWhitelist():
  340. common = ( u'id', u'class', u'lang', u'dir', u'title', u'style' )
  341. block = common + (u'align',)
  342. tablealign = ( u'align', u'char', u'charoff', u'valign' )
  343. tablecell = ( u'abbr',
  344. u'axis',
  345. u'headers',
  346. u'scope',
  347. u'rowspan',
  348. u'colspan',
  349. u'nowrap', # deprecated
  350. u'width', # deprecated
  351. u'height', # deprecated
  352. u'bgcolor' # deprecated
  353. )
  354. return {
  355. u'div': block,
  356. u'center': common, # deprecated
  357. u'span': block, # ??
  358. u'h1': block,
  359. u'h2': block,
  360. u'h3': block,
  361. u'h4': block,
  362. u'h5': block,
  363. u'h6': block,
  364. u'em': common,
  365. u'strong': common,
  366. u'cite': common,
  367. u'code': common,
  368. u'var': common,
  369. u'img': common + (u'src', u'alt', u'width', u'height',),
  370. u'blockquote': common + (u'cite',),
  371. u'sub': common,
  372. u'sup': common,
  373. u'p': block,
  374. u'br': (u'id', u'class', u'title', u'style', u'clear',),
  375. u'pre': common + (u'width',),
  376. u'ins': common + (u'cite', u'datetime'),
  377. u'del': common + (u'cite', u'datetime'),
  378. u'ul': common + (u'type',),
  379. u'ol': common + (u'type', u'start'),
  380. u'li': common + (u'type', u'value'),
  381. u'dl': common,
  382. u'dd': common,
  383. u'dt': common,
  384. u'table': common + ( u'summary', u'width', u'border', u'frame',
  385. u'rules', u'cellspacing', u'cellpadding',
  386. u'align', u'bgcolor',
  387. ),
  388. u'caption': common + (u'align',),
  389. u'thead': common + tablealign,
  390. u'tfoot': common + tablealign,
  391. u'tbody': common + tablealign,
  392. u'colgroup': common + ( u'span', u'width' ) + tablealign,
  393. u'col': common + ( u'span', u'width' ) + tablealign,
  394. u'tr': common + ( u'bgcolor', ) + tablealign,
  395. u'td': common + tablecell + tablealign,
  396. u'th': common + tablecell + tablealign,
  397. u'tt': common,
  398. u'b': common,
  399. u'i': common,
  400. u'big': common,
  401. u'small': common,
  402. u'strike': common,
  403. u's': common,
  404. u'u': common,
  405. u'font': common + ( u'size', u'color', u'face' ),
  406. u'hr': common + ( u'noshade', u'size', u'width' ),
  407. u'ruby': common,
  408. u'rb': common,
  409. u'rt': common, #array_merge( $common, array( 'rbspan' ) ),
  410. u'rp': common,
  411. }
  412. _whitelist = setupAttributeWhitelist()
  413. _page_cache = {}
  414. env = {}
  mTagHooks = {} # tag hook registry; assumed restoration, since strip() and registerTagHook() use it but its definition is missing from this dump
  415. def registerTagHook(tag, function):
  416. mTagHooks[tag] = function
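# Usage sketch (editor's illustration; not part of the original file). A tag
# hook receives (parser, tag_content, params) and returns HTML; the 'math'
# tag and renderer below are hypothetical, not byteflow features:
# >>> def render_math(parser, content, params):
# ...     return u'<code>%s</code>' % content
# >>> registerTagHook(u'math', render_math) # <math>...</math> now calls the hook in strip()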
  417. class BaseParser(object):
  418. def __init__(self):
  419. self.uniq_prefix = u"\x07UNIQ" + unicode(random.randint(1, 1000000000))
  420. self.strip_state = {}
  421. self.arg_stack = []
  422. self.env = env
  423. self.keep_env = (env != {})
  424. def __del__(self):
  425. if not self.keep_env:
  426. global env
  427. env = {}
  428. def store_object(self, namespace, key, value=True):
  429. '''Store an object in the environment; used to prevent recursive imports.'''
  430. # Store the item so it is not reprocessed
  432. if namespace not in self.env:
  433. self.env[namespace] = {}
  434. self.env[namespace][key] = value
  435. def has_object(self, namespace, key):
  436. if namespace not in self.env:
  437. self.env[namespace] = {}
  443. return key in self.env[namespace]
  444. def retrieve_object(self, namespace, key, default=None):
  445. if not self.env.get(namespace):
  446. self.env[namespace] = {}
  447. return self.env[namespace].get(key, default)
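# Usage sketch (editor's illustration; not part of the original file). The
# namespace/key store above is a per-parse memo, e.g. so an included page is
# not processed twice:
# >>> p = BaseParser()
# >>> p.store_object('includes', u'SomePage') # value defaults to True
# >>> p.has_object('includes', u'SomePage')
# True
# >>> p.retrieve_object('includes', u'Missing', default=False)
# False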
  448. def parse(self, text):
  449. utf8 = isinstance(text, str)
  450. text = to_unicode(text)
  451. if text[-1:] != u'\n':
  452. text = text + u'\n'
  453. taggedNewline = True
  454. else:
  455. taggedNewline = False
  456. text = self.strip(text)
  457. text = self.removeHtmlTags(text)
  458. text = self.parseHorizontalRule(text)
  459. text = self.parseAllQuotes(text)
  460. text = self.replaceExternalLinks(text)
  461. text = self.unstrip(text)
  462. text = self.fixtags(text)
  463. text = self.doBlockLevels(text, True)
  464. text = self.unstripNoWiki(text)
  465. text = text.split(u'\n')
  466. text = u'\n'.join(text)
  467. if taggedNewline and text[-1:] == u'\n':
  468. text = text[:-1]
  469. if utf8:
  470. return text.encode("utf-8")
  471. return text
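# Usage sketch (editor's illustration; not part of the original file).
# parse() accepts str or unicode and mirrors the input type, encoding str
# results back to utf-8:
# >>> BaseParser().parse(u"''hello''")
# u'<p><i>hello</i>\n</p>'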
  472. def strip(self, text, stripcomments=False, dontstrip=[]):
  473. render = True
  474. commentState = {}
  475. elements = ['nowiki',] + mTagHooks.keys()
  476. if True: #wgRawHtml
  477. elements.append('html')
  478. # Removing $dontstrip tags from $elements list (currently only 'gallery', fixing bug 2700)
  479. for k in dontstrip:
  480. if k in elements:
  481. elements.remove(k)
  482. matches = {}
  483. text = self.extractTagsAndParams(elements, text, matches)
  484. for marker in matches:
  485. element, content, params, tag = matches[marker]
  486. if render:
  487. tagName = element.lower()
  488. if tagName == u'!--':
  489. # comment
  490. output = tag
  491. if tag[-3:] != u'-->':
  492. output += "-->"
  493. elif tagName == u'html':
  494. output = content
  495. elif tagName == u'nowiki':
  496. output = content.replace(u'&', u'&amp;').replace(u'<', u'&lt;').replace(u'>', u'&gt;')
  497. else:
  498. if tagName in mTagHooks:
  499. output = mTagHooks[tagName](self, content, params)
  500. else:
  501. output = content.replace(u'&', u'&amp;').replace(u'<', u'&lt;').replace(u'>', u'&gt;')
  502. else:
  503. # Just stripping tags; keep the source
  504. output = tag
  505. # Unstrip the output, because unstrip() is no longer recursive so
  506. # it won't do it itself
  507. output = self.unstrip(output)
  508. if not stripcomments and element == u'!--':
  509. commentState[marker] = output
  510. elif element == u'html' or element == u'nowiki':
  511. if 'nowiki' not in self.strip_state:
  512. self.strip_state['nowiki'] = {}
  513. self.strip_state['nowiki'][marker] = output
  514. else:
  515. if 'general' not in self.strip_state:
  516. self.strip_state['general'] = {}
  517. self.strip_state['general'][marker] = output
  518. # Unstrip comments unless explicitly told otherwise.
  519. # (The comments are always stripped prior to this point, so as to
  520. # not invoke any extension tags / parser hooks contained within
  521. # a comment.)
  522. if not stripcomments:
  523. # Put them all back and forget them
  524. for k in commentState:
  525. v = commentState[k]
  526. text = text.replace(k, v)
  527. return text
  528. def removeHtmlTags(self, text):
  529. """convert bad tags into HTML identities"""
  530. sb = []
  531. text = self.removeHtmlComments(text)
  532. bits = text.split(u'<')
  533. sb.append(bits.pop(0))
  534. tagstack = []
  535. tablestack = tagstack
  536. for x in bits:
  537. m = _tagPattern.match(x)
  538. if not m:
  539. sb.append(u'&lt;' + x.replace(u'>', u'&gt;')) # escape the unmatched '<' instead of dropping the text
  540. continue
  540. slash, t, params, brace, rest = m.groups()
  541. t = t.lower()
  542. badtag = False
  543. if t in _htmlelements:
  544. # Check our stack
  545. if slash:
  546. # Closing a tag...
  547. if t in _htmlsingleonly or len(tagstack) == 0:
  548. badtag = True
  549. else:
  550. ot = tagstack.pop()
  551. if ot != t:
  552. if ot in _htmlsingleallowed:
  553. # Pop all elements with an optional close tag
  554. # and see if we find a match below them
  555. optstack = []
  556. optstack.append(ot)
  557. while True:
  558. if len(tagstack) == 0:
  559. break
  560. ot = tagstack.pop()
  561. if ot == t or ot not in _htmlsingleallowed:
  562. break
  563. optstack.append(ot)
  564. if t != ot:
  565. # No match. Push the optional elements back again
  566. badtag = True
  567. tagstack += reversed(optstack)
  568. else:
  569. tagstack.append(ot)
  570. # <li> can be nested in <ul> or <ol>, skip those cases:
  571. if ot not in _htmllist and t in _listtags:
  572. badtag = True
  573. elif t == u'table':
  574. if len(tablestack) == 0:
  575. badtag = True
  576. else:
  577. tagstack = tablestack.pop()
  578. newparams = u''
  579. else:
  580. # Keep track for later
  581. if t in _tabletags and u'table' not in tagstack:
  582. badtag = True
  583. elif t in tagstack and t not in _htmlnest:
  584. badtag = True
  585. # Is it a self-closed htmlpair? (bug 5487)
  586. elif brace == u'/>' and t in _htmlpairs:
  587. badtag = True
  588. elif t in _htmlsingleonly:
  589. # Hack to force empty tag for uncloseable elements
  590. brace = u'/>'
  591. elif t in _htmlsingle:
  592. # Hack to not close $htmlsingle tags
  593. brace = None
  594. else:
  595. if t == u'table':
  596. tablestack.append(tagstack)
  597. tagstack = []
  598. tagstack.append(t)
  599. newparams = self.fixTagAttributes(params, t)
  600. if not badtag:
  601. rest = rest.replace(u'>', u'&gt;')
  602. if brace == u'/>':
  603. close = u' /'
  604. else:
  605. close = u''
  606. sb.append(u'<')
  607. sb.append(slash)
  608. sb.append(t)
  609. sb.append(newparams)
  610. sb.append(close)
  611. sb.append(u'>')
  612. sb.append(rest)
  613. continue
  614. sb.append(u'&lt;')
  615. sb.append(x.replace(u'>', u'&gt;'))
  616. # Close off any remaining tags
  617. while tagstack:
  618. t = tagstack.pop()
  619. sb.append(u'</')
  620. sb.append(t)
  621. sb.append(u'>\n')
  622. if t == u'table':
  623. if not tablestack:
  624. break
  625. tagstack = tablestack.pop()
  626. return u''.join(sb)
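# Usage sketch (editor's illustration; not part of the original file). Tags
# outside _htmlelements, and unbalanced close tags, are entity-escaped rather
# than dropped, so the source text survives:
# >>> BaseParser().removeHtmlTags(u'<b>ok</b> <script>alert(1)</script>')
# u'<b>ok</b> &lt;script&gt;alert(1)&lt;/script&gt;'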
  627. def removeHtmlComments(self, text):
  628. """remove <!-- text --> comments from given text"""
  629. sb = []
  630. start = text.find(u'<!--')
  631. last = 0
  632. while start != -1:
  633. end = text.find(u'-->', start)
  634. if end == -1:
  635. break
  636. end += 3
  637. spaceStart = max(0, start-1)
  638. spaceEnd = end
  639. while text[spaceStart] == u' ' and spaceStart > 0:
  640. spaceStart -= 1
  641. while spaceEnd < len(text) and text[spaceEnd] == u' ':
  642. spaceEnd += 1
  643. if text[spaceStart] == u'\n' and text[spaceEnd:spaceEnd+1] == u'\n':
  644. sb.append(text[last:spaceStart])
  645. sb.append(u'\n')
  646. last = spaceEnd+1
  647. else:
  648. sb.append(text[last:spaceStart+1])
  649. last = spaceEnd
  650. start = text.find(u'<!--', end)
  651. sb.append(text[last:])
  652. return u''.join(sb)
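# Usage sketch (editor's illustration; not part of the original file). Note
# that the spaces around an inline comment are swallowed along with it:
# >>> BaseParser().removeHtmlComments(u'keep <!-- gone --> this')
# u'keepthis'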
  653. def decodeTagAttributes(self, text):
  654. """docstring for decodeTagAttributes"""
  655. attribs = {}
  656. if text.strip() == u'':
  657. return attribs
  658. scanner = _attributePat.scanner(text)
  659. match = scanner.search()
  660. while match:
  661. key, val1, val2, val3, val4 = match.groups()
  662. value = val1 or val2 or val3 or val4
  663. if value:
  664. value = _space.sub(u' ', value).strip()
  665. else:
  666. value = ''
  667. attribs[key] = self.decodeCharReferences(value)
  668. match = scanner.search()
  669. return attribs
  670. def validateTagAttributes(self, attribs, element):
  671. """docstring for validateTagAttributes"""
  672. out = {}
  673. if element not in _whitelist:
  674. return out
  675. whitelist = _whitelist[element]
  676. for attribute in attribs:
  677. value = attribs[attribute]
  678. if attribute not in whitelist:
  679. continue
  680. # Strip javascript "expression" from stylesheets.
  681. # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
  682. if attribute == u'style':
  683. value = self.checkCss(value)
  684. if value == False:
  685. continue
  686. elif attribute == u'id':
  687. value = self.escapeId(value)
  688. # If this attribute was previously set, override it.
  689. # Output should only have one attribute of each name.
  690. out[attribute] = value
  691. return out
  692. def safeEncodeAttribute(self, encValue):
  693. """docstring for safeEncodeAttribute"""
  694. encValue = encValue.replace(u'&', u'&amp;')
  695. encValue = encValue.replace(u'<', u'&lt;')
  696. encValue = encValue.replace(u'>', u'&gt;')
  697. encValue = encValue.replace(u'"', u'&quot;')
  698. encValue = encValue.replace(u'{', u'&#123;')
  699. encValue = encValue.replace(u'[', u'&#91;')
  700. encValue = encValue.replace(u"''", u'&#39;&#39;')
  701. encValue = encValue.replace(u'ISBN', u'&#73;SBN')
  702. encValue = encValue.replace(u'RFC', u'&#82;FC')
  703. encValue = encValue.replace(u'PMID', u'&#80;MID')
  704. encValue = encValue.replace(u'|', u'&#124;')
  705. encValue = encValue.replace(u'__', u'&#95;_')
  706. encValue = encValue.replace(u'\n', u'&#10;')
  707. encValue = encValue.replace(u'\r', u'&#13;')
  708. encValue = encValue.replace(u'\t', u'&#9;')
  709. return encValue
  710. def fixTagAttributes(self, text, element):
  711. if text.strip() == u'':
  712. return u''
  713. stripped = self.validateTagAttributes(self.decodeTagAttributes(text), element)
  714. sb = []
  715. for attribute in stripped:
  716. value = stripped[attribute]
  717. encAttribute = attribute.replace(u'&', u'&amp;').replace(u'<', u'&lt;').replace(u'>', u'&gt;')
  718. encValue = self.safeEncodeAttribute(value)
  719. sb.append(u' ')
  720. sb.append(encAttribute)
  721. sb.append(u'="')
  722. sb.append(encValue)
  723. sb.append(u'"')
  724. return u''.join(sb)
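# Usage sketch (editor's illustration; not part of the original file). Only
# attributes on the element's whitelist survive the decode/validate/encode
# round trip:
# >>> BaseParser().fixTagAttributes(u'class="note" onclick="evil()"', u'div')
# u' class="note"'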
  725. def validateCodepoint(self, codepoint):
  726. return codepoint == 0x09 \
  727. or codepoint == 0x0a \
  728. or codepoint == 0x0d \
  729. or (codepoint >= 0x20 and codepoint <= 0xd7ff) \
  730. or (codepoint >= 0xe000 and codepoint <= 0xfffd) \
  731. or (codepoint >= 0x10000 and codepoint <= 0x10ffff)
  732. def _normalizeCallback(self, match):
  733. text, norm, dec, hexval, _ = match.groups()
  734. if norm:
  735. sb = []
  736. sb.append(u'&')
  737. if norm not in _htmlEntities:
  738. sb.append(u'amp;')
  739. sb.append(norm)
  740. sb.append(u';')
  741. return u''.join(sb)
  742. elif dec:
  743. dec = int(dec)
  744. if self.validateCodepoint(dec):
  745. sb = []
  746. sb.append(u'&#')
  747. sb.append(unicode(dec))
  748. sb.append(u';')
  749. return u''.join(sb)
  750. elif hexval:
  751. hexval = int(hexval, 16)
  752. if self.validateCodepoint(hexval):
  753. sb = []
  754. sb.append(u'&#x')
  755. sb.append(u'%x' % hexval)
  756. sb.append(u';')
  757. return u''.join(sb)
  758. return text.replace(u'&', u'&amp;').replace(u'<', u'&lt;').replace(u'>', u'&gt;')
  759. def normalizeCharReferences(self, text):
  760. """docstring for normalizeCharReferences"""
  761. return _charRefsPat.sub(self._normalizeCallback, text)
  762. def _decodeCallback(self, match):
  763. text, norm, dec, hexval, _ = match.groups()
  764. if norm:
  765. if norm in _htmlEntities:
  766. return unichr(_htmlEntities[norm])
  767. else:
  768. sb = []
  769. sb.append(u'&')
  770. sb.append(norm)
  771. sb.append(u';')
  772. return u''.join(sb)
  773. elif dec:
  774. dec = int(dec)
  775. if self.validateCodepoint(dec):
  776. return unichr(dec)
  777. return u'?'
  778. elif hexval:
  779. hexval = int(hexval, 16)
  780. if self.validateCodepoint(hexval):
  781. return unichr(hexval)
  782. return u'?'
  783. return text
  784. def decodeCharReferences(self, text):
  785. """docstring for decodeCharReferences"""
  786. if text:
  787. return _charRefsPat.sub(self._decodeCallback, text)
  788. return ''
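# Usage sketch (editor's illustration; not part of the original file). Named,
# decimal and hexadecimal references all decode through _decodeCallback:
# >>> BaseParser().decodeCharReferences(u'&amp; &#60; &#x3E;')
# u'& < >'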
  789. def _convertToUtf8(self, s):
  790. return unichr(int(s.group(1), 16))
  791. def checkCss(self, value):
  792. """docstring for checkCss"""
  793. stripped = self.decodeCharReferences(value)
  794. stripped = _cssCommentPat.sub(u'', stripped)
  795. value = stripped
  796. stripped = _toUTFPat.sub(self._convertToUtf8, stripped)
  797. stripped = stripped.replace(u'\\', u'')
  798. if _hackPat.search(stripped):
  799. # someone is haxx0ring
  800. return False
  801. return value
  802. def escapeId(self, value):
  803. """docstring for escapeId"""
  804. # TODO
  805. return safe_name(value)
  806. def parseHorizontalRule(self, text):
  807. return _hrPat.sub(ur'<hr />', text)
  808. def parseHeaders(self, text):
  809. text = _h6Pat.sub(ur'<h6>\1</h6>', text)
  810. text = _h5Pat.sub(ur'<h5>\1</h5>', text)
  811. text = _h4Pat.sub(ur'<h4>\1</h4>', text)
  812. text = _h3Pat.sub(ur'<h3>\1</h3>', text)
  813. text = _h2Pat.sub(ur'<h2>\1</h2>', text)
  814. text = _h1Pat.sub(ur'<h1>\1</h1>', text)
  815. return text
  816. def parseQuotes(self, text):
  817. arr = _quotePat.split(text)
  818. if len(arr) == 1:
  819. return text
  820. # First, do some preliminary work. This may shift some apostrophes from
  821. # being mark-up to being text. It also counts the number of occurrences
  822. # of bold and italics mark-ups.
  823. numBold = 0
  824. numItalics = 0
  825. for i,r in zip(range(len(arr)), arr):
  826. if i%2 == 1:
  827. l = len(r)
  828. if l == 4:
  829. arr[i-1] += u"'"
  830. arr[i] = u"'''"
  831. elif l > 5:
  832. arr[i-1] += u"'" * (len(arr[i]) - 5)
  833. arr[i] = u"'''''"
  834. if l == 2:
  835. numItalics += 1
  836. elif l >= 5:
  837. numItalics += 1
  838. numBold += 1
  839. else:
  840. numBold += 1
  841. # If there is an odd number of both bold and italics, it is likely
  842. # that one of the bold ones was meant to be an apostrophe followed
  843. # by italics. Which one we cannot know for certain, but it is more
  844. # likely to be one that has a single-letter word before it.
  845. if numBold%2 == 1 and numItalics%2 == 1:
  846. firstSingleLetterWord = -1
  847. firstMultiLetterWord = -1
  848. firstSpace = -1
  849. for i,r in zip(range(len(arr)), arr):
  850. if i%2 == 1 and len(r) == 3:
  851. x1 = arr[i-1][-1:]
  852. x2 = arr[i-1][-2:-1]
  853. if x1 == u' ':
  854. if firstSpace == -1:
  855. firstSpace = i
  856. elif x2 == u' ':
  857. if firstSingleLetterWord == -1:
  858. firstSingleLetterWord = i
  859. else:
  860. if firstMultiLetterWord == -1:
  861. firstMultiLetterWord = i
  862. # If there is a single-letter word, use it!
  863. if firstSingleLetterWord > -1:
  864. arr[firstSingleLetterWord] = u"''"
  865. arr[firstSingleLetterWord-1] += u"'"
  866. # If not, but there's a multi-letter word, use that one.
  867. elif firstMultiLetterWord > -1:
  868. arr[firstMultiLetterWord] = u"''"
  869. arr[firstMultiLetterWord-1] += u"'"
  870. # ... otherwise use the first one that has neither.
  871. # (notice that it is possible for all three to be -1 if, for example,
  872. # there is only one quintuple-apostrophe in the line)
  873. elif firstSpace > -1:
  874. arr[firstSpace] = u"''"
  875. arr[firstSpace-1] += u"'"
  876. # Now let's actually convert our apostrophic mush to HTML!
  877. output = []
  878. buffer = None
  879. state = ''
  880. for i,r in zip(range(len(arr)), arr):
  881. if i%2 == 0:
  882. if state == 'both':
  883. buffer.append(r)
  884. else:
  885. output.append(r)
  886. else:
  887. if len(r) == 2:
  888. if state == 'i':
  889. output.append(u"</i>")
  890. state = ''
  891. elif state == 'bi':
  892. output.append(u"</i>")
  893. state = 'b'
  894. elif state == 'ib':
  895. output.append(u"</b></i><b>")
  896. state = 'b'
  897. elif state == 'both':
  898. output.append(u"<b><i>")
  899. output.append(u''.join(buffer))
  900. buffer = None
  901. output.append(u"</i>")
  902. state = 'b'
  903. elif state == 'b':
  904. output.append(u"<i>")
  905. state = 'bi'
  906. else: # ''
  907. output.append(u"<i>")
  908. state = 'i'
  909. elif len(r) == 3:
  910. if state == 'b':
  911. output.append(u"</b>")
  912. state = ''
  913. elif state == 'bi':
  914. output.append(u"</i></b><i>")
  915. state = 'i'
  916. elif state == 'ib':
  917. output.append(u"</b>")
  918. state = 'i'
  919. elif state == 'both':
  920. output.append(u"<i><b>")
  921. output.append(u''.join(buffer))
  922. buffer = None
  923. output.append(u"</b>")
  924. state = 'i'
  925. elif state == 'i':
  926. output.append(u"<b>")
  927. state = 'ib'
  928. else: # ''
  929. output.append(u"<b>")
  930. state = 'b'
  931. elif len(r) == 5:
  932. if state == 'b':
  933. output.append(u"</b><i>")
  934. state = 'i'
  935. elif state == 'i':
  936. output.append(u"</i><b>")
  937. state = 'b'
  938. elif state == 'bi':
  939. output.append(u"</i></b>")
  940. state = ''
  941. elif state == 'ib':
  942. output.append(u"</b></i>")
  943. state = ''
  944. elif state == 'both':
  945. output.append(u"<i><b>")
  946. output.append(u''.join(buffer))
  947. buffer = None
  948. output.append(u"</b></i>")
  949. state = ''
  950. else: # ''
  951. buffer = []
  952. state = 'both'
  953. if state == 'both':
  954. output.append(u"<i><b>")
  955. output.append(u''.join(buffer))
  956. buffer = None
  957. output.append(u"</b></i>")
  958. elif state != '':
  959. if state == 'b' or state == 'ib':
  960. output.append(u"</b>")
  961. if state == 'i' or state == 'bi' or state == 'ib':
  962. output.append(u"</i>")
  963. if state == 'bi':
  964. output.append(u"</b>")
  965. return u''.join(output)
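# Usage sketch (editor's illustration; not part of the original file). Runs
# of two, three or five apostrophes drive the italic/bold state machine:
# >>> BaseParser().parseQuotes(u"''italic'' and '''bold'''")
# u'<i>italic</i> and <b>bold</b>'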
  966. def parseAllQuotes(self, text):
  967. sb = []
  968. lines = text.split(u'\n')
  969. first = True
  970. for line in lines:
  971. if not first:
  972. sb.append(u'\n')
  973. else:
  974. first = False
  975. sb.append(self.parseQuotes(line))
  976. return u''.join(sb)
  977. def replaceExternalLinks(self, text):
  978. sb = []
  979. bits = _bracketedLinkPat.split(text)
  980. l = len(bits)
  981. i = 0
  982. num_links = 0
  983. while i < l:
  984. if i%3 == 0:
  985. #sb.append(self.replaceFreeExternalLinks(bits[i]))
  986. sb.append(bits[i])
  987. i += 1
  988. else:
  989. sb.append(u'<a href="')
  990. sb.append(bits[i])
  991. sb.append(u'">')
  992. if not bits[i+1]:
  993. num_links += 1
  994. sb.append(to_unicode(truncate_url(bits[i])))
  995. else:
  996. sb.append(bits[i+1])
  997. sb.append(u'</a>')
  998. i += 2
  999. return ''.join(sb)
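# Usage sketch (editor's illustration; not part of the original file).
# Bracketed external links become anchors; an empty label would fall back to
# truncate_url() on the target:
# >>> BaseParser().replaceExternalLinks(u'see [http://example.com the site]')
# u'see <a href="http://example.com">the site</a>'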
  1000. # TODO: fix this so it actually works
  1001. def replaceFreeExternalLinks(self, text):
  1002. bits = _protocolPat.split(text)
  1003. sb = [bits.pop(0)]
  1004. i = 0
  1005. l = len(bits)
  1006. while i < l:
  1007. protocol = bits[i]
  1008. remainder = bits[i+1]
  1009. i += 2
  1010. match = _specialUrlPat.match(remainder)
  1011. if match:
  1012. # Found some characters after the protocol that look promising
  1013. url = protocol + match.group(1)
  1014. trail = match.group(2)
  1015. # special case: handle urls as url args:
  1016. # http://www.example.com/foo?=http://www.example.com/bar
  1017. if len(trail) == 0 and len(bits) > i and _protocolsPat.match(bits[i]):
  1018. match = _specialUrlPat.match(remainder)
  1019. if match:
  1020. url += bits[i] + match.group(1)
  1021. i += 2
  1022. trail = match.group(2)
  1023. # The characters '<' and '>' (which were escaped by
  1024. # removeHTMLtags()) should not be included in
  1025. # URLs, per RFC 2396.
  1026. pos = max(url.find('&lt;'), url.find('&gt;'))
  1027. if pos != -1:
  1028. trail = url[pos:] + trail
  1029. url = url[0:pos]
  1030. sep = ',;.:!?'
  1031. if '(' not in url:
  1032. sep += ')'
  1033. i = len(url)-1
  1034. while i >= 0:
  1035. char = url[i]
  1036. if char not in sep:
  1037. break
  1038. i -= 1
  1039. i += 1
  1040. if i != len(url):
  1041. trail = url[i:] + trail
  1042. url = url[0:i]
  1043. url = self.cleanURL(url)
  1044. sb.append(u'<a href="')
  1045. sb.append(url)
  1046. sb.append(u'">')
  1047. sb.append(truncate_url(url))
  1048. sb.append(u'</a>')
  1049. #sb.append(text)
  1050. sb.append(trail)
  1051. else:
  1052. sb.append(protocol)
  1053. sb.append(remainder)
  1054. return ''.join(sb)
  1055. def urlencode(self, char):
  1056. num = ord(char)
  1057. if num == 32:
  1058. return '+'
  1059. return "%%%02x" % num
  1060. def cleanURL(self, url):
  1061. # Normalize any HTML entities in input. They will be
  1062. # re-escaped by makeExternalLink().
  1063. url = self.decodeCharReferences(url)
  1064. # Escape any control characters introduced by the above step
  1065. url = _controlCharsPat.sub(self.urlencode, url)
  1066. # Validate hostname portion
  1067. match = _hostnamePat.match(url)
  1068. if match:
  1069. protocol, host, rest = match.groups()
  1070. # Characters that will be ignored in IDNs.
  1071. # http://tools.ietf.org/html/rfc3454#section-3.1
  1072. # Strip them before further processing so blacklists and such work.
  1073. host = _stripPat.sub('', host)
  1074. # @fixme: validate hostnames here
  1075. return protocol + host + rest
  1076. else:
  1077. return url
  1078. def unstripForHTML(self, text):
  1079. text = self.unstrip(text)
  1080. text = self.unstripNoWiki(text)
  1081. return text
  1082. def unstrip(self, text):
  1083. if 'general' not in self.strip_state:
  1084. return text
  1085. general = self.strip_state['general']
  1086. for k in general:
  1087. v = general[k]
  1088. text = text.replace(k, v)
  1089. return text
  1090. def unstripNoWiki(self, text):
  1091. if 'nowiki' not in self.strip_state:
  1092. return text
  1093. nowiki = self.strip_state['nowiki']
  1094. for k in nowiki:
  1095. v = nowiki[k]
  1096. text = text.replace(k, v)
  1097. return text
  1098. def extractTagsAndParams(self, elements, text, matches):
  1099. """
  1100. Replaces all occurrences of HTML-style comments and the given tags
  1101. in the text with unique markers and returns the new text. The output
  1102. parameter 'matches' will be a dict filled with data in
  1103. the form:
  1104. 'UNIQ-xxxxx': ('element',
  1105. 'tag content',
  1106. {'param': 'x'},
  1107. '<element param="x">tag content</element>')
  1109. """
  1110. stripped = u''
  1111. taglist = u'|'.join(elements)
  1112. if taglist not in _startRegexHash:
  1113. _startRegexHash[taglist] = re.compile(ur"<(" + taglist + ur")(\s+[^>]*?|\s*?)(/?>)|<(!--)", re.UNICODE | re.IGNORECASE)
  1114. start = _startRegexHash[taglist]
  1115. while text != u'':
  1116. p = start.split(text, 1)
  1117. stripped += p[0]
  1118. if len(p) == 1:
  1119. break
  1120. elif p[4]:
  1121. # comment
  1122. element = p[4]
  1123. attributes = u''
  1124. close = u''
  1125. else:
  1126. element = p[1]
  1127. attributes = p[2]
  1128. close = p[3]
  1129. inside = p[5]
  1130. global _extractTagsAndParams_n
  1131. marker = self.uniq_prefix + u'-' + element + u'-' + (u"%08X" % _extractTagsAndParams_n) + u'-QINU'
  1132. _extractTagsAndParams_n += 1
  1133. stripped += marker
  1134. if close == u'/>':
  1135. # empty element tag, <tag />
  1136. content = None
  1137. text = inside
  1138. tail = None
  1139. else:
  1140. if element == u'!--':
  1141. end = _endCommentPat
  1142. else:
  1143. if element not in _endRegexHash:
  1144. _endRegexHash[element] = re.compile(ur'(</' + element + ur'\s*>)', re.UNICODE | re.IGNORECASE)
  1145. end = _endRegexHash[element]
  1146. q = end.split(inside, 1)
  1147. content = q[0]
  1148. if len(q) < 3:
  1149. # no end tag
  1150. tail = ''
  1151. text = ''
  1152. else:
  1153. tail = q[1]
  1154. text = q[2]
  1155. matches[marker] = (
  1156. element,
  1157. content,
  1158. self.decodeTagAttributes(attributes),
  1159. u"<" + element + attributes + close + content + tail
  1160. )
  1161. return stripped
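# Usage sketch (editor's illustration; not part of the original file). The
# tag body is parked under a unique marker until unstrip()/unstripNoWiki():
# >>> p = BaseParser()
# >>> found = {}
# >>> p.extractTagsAndParams(['nowiki'], u'a <nowiki>[[x]]</nowiki> b', found)
# u'a \x07UNIQ...-nowiki-00000001-QINU b'   (marker elided)
# >>> found.values()[0][:2]
# (u'nowiki', u'[[x]]')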
  1162. def fixtags(self, text):
  1163. """Clean up special characters, only run once, next-to-last before doBlockLevels"""
  1164. # french spaces, last one Guillemet-left
  1165. # only if there is something before the space
  1166. text = _guillemetLeftPat.sub(ur'\1&nbsp;\2', text)
  1167. # french spaces, Guillemet-right
  1168. text = _guillemetRightPat.sub(ur'\1&nbsp;', text)
  1169. return text
  1170. def closeParagraph(self, mLastSection):
  1171. """Used by doBlockLevels()"""
  1172. result = u''
  1173. if mLastSection != u'':
  1174. result = u'</' + mLastSection + u'>\n'
  1175. return result
  1176. def getCommon(self, st1, st2):
  1177. """
  1178. getCommon() returns the length of the longest common substring
  1179. of both arguments, starting at the beginning of both.
  1180. """
  1181. fl = len(st1)
  1182. shorter = len(st2)
  1183. if fl < shorter:
  1184. shorter = fl
  1185. i = 0
  1186. while i < shorter:
  1187. if st1[i] != st2[i]:
  1188. break
  1189. i += 1
  1190. return i
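# Usage sketch (editor's illustration; not part of the original file). The
# shared '**' prefix is what keeps two list levels open in doBlockLevels():
# >>> BaseParser().getCommon(u'**#', u'**:')
# 2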
  1191. def openList(self, char, mLastSection):
  1192. """
  1193. These next three functions open, continue, and close the list
  1194. element appropriate to the prefix character passed into them.
  1195. """
  1196. result = self.closeParagraph(mLastSection)
  1197. mDTopen = False
  1198. if char == u'*':
  1199. result += u'<ul><li>'
  1200. elif char == u'#':
  1201. result += u'<ol><li>'
  1202. elif char == u':':
  1203. result += u'<dl><dd>'
  1204. elif char == u';':
  1205. result += u'<dl><dt>'
  1206. mDTopen = True
  1207. else:
  1208. result += u'<!-- ERR 1 -->'
  1209. return result, mDTopen
  1210. def nextItem(self, char, mDTopen):
  1211. if char == u'*' or char == '#':
  1212. return u'</li><li>', None
  1213. elif char == u':' or char == u';':
  1214. close = u'</dd>'
  1215. if mDTopen:
  1216. close = '</dt>'
  1217. if char == u';':
  1218. return close + u'<dt>', True
  1219. else:
  1220. return close + u'<dd>', False
  1221. return u'<!-- ERR 2 -->', None
  1222. def closeList(self, char, mDTopen):
  1223. if char == u'*':
  1224. return u'</li></ul>\n'
  1225. elif char == u'#':
  1226. return u'</li></ol>\n'
  1227. elif char == u':':
  1228. if mDTopen:
  1229. return u'</dt></dl>\n'
  1230. else:
  1231. return u'</dd></dl>\n'
  1232. else:
  1233. return u'<!-- ERR 3 -->'
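# Usage sketch (editor's illustration; not part of the original file). The
# three helpers compose like so for a two-item bullet list:
# >>> p = BaseParser()
# >>> opened, dt = p.openList(u'*', u'')   # u'<ul><li>', dt is False
# >>> nxt, _ = p.nextItem(u'*', dt)        # u'</li><li>'
# >>> opened + u'one' + nxt + u'two' + p.closeList(u'*', dt)
# u'<ul><li>one</li><li>two</li></ul>\n'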
  1234. def findColonNoLinks(self, text, before, after):
  1235. pos = text.find(':')
  1236. if pos == -1:
  1237. return False
  1239. lt = text.find('<')
  1240. if lt == -1 or lt > pos:
  1241. # Easy; no tag nesting to worry about
  1242. before = text[0:pos]
  1243. after = text[pos+1:]
  1244. return before, after, pos
  1245. # Ugly state machine to walk through avoiding tags.
  1246. state = MW_COLON_STATE_TEXT
  1247. stack = 0
  1248. i = 0
  1249. while i < len(text):
  1250. c = text[i]
  1251. if state == 0: # MW_COLON_STATE_TEXT:
  1252. if text[i] == '<':
  1253. # Could be either a <start> tag or an </end> tag
  1254. state = MW_COLON_STATE_TAGSTART
  1255. elif text[i] == ':':
  1256. if stack == 0:
  1257. # we found it
  1258. return text[0:i], text[i+1:], i
  1259. else:
  1260. # Skip ahead looking for something interesting
  1261. colon = text.find(':', i)
  1262. if colon == -1:
  1263. return False
  1265. lt = text.find('<', i)
  1266. if stack == 0:
  1267. if lt == -1 or colon < lt:
  1268. # we found it
  1269. return text[0:colon], text[colon+1:], colon
  1270. if lt == -1:
  1271. break
  1272. # Skip ahead to next tag start
  1273. i = lt
  1274. state = MW_COLON_STATE_TAGSTART
  1275. elif state == 1: # MW_COLON_STATE_TAG:
  1276. # In a <tag>
  1277. if text[i] == '>':
  1278. stack += 1
  1279. state = MW_COLON_STATE_TEXT
  1280. elif text[i] == '/':
  1281. state = MW_COLON_STATE_TAGSLASH
  1282. elif state == 2: # MW_COLON_STATE_TAGSTART:
  1283. if text[i] == '/':
  1284. state = MW_COLON_STATE_CLOSETAG
  1285. elif text[i] == '!':
  1286. state = MW_COLON_STATE_COMMENT
  1287. elif text[i] == '>':
  1288. # Illegal early close? This shouldn't happen D:
  1289. state = MW_COLON_STATE_TEXT
  1290. else:
  1291. state = MW_COLON_STATE_TAG
  1292. elif state == 3: # MW_COLON_STATE_CLOSETAG:
  1293. # In a </tag>
  1294. if text[i] == '>':
  1295. stack -= 1
  1296. if stack < 0:
  1297. return False
  1298. state = MW_COLON_STATE_TEXT
  1299. elif state == MW_COLON_STATE_TAGSLASH:
  1300. if text[i] == '>':
  1301. # Yes, a self-closed tag <blah/>
  1302. state = MW_COLON_STATE_TEXT
  1303. else:
  1304. # Probably we're jumping the gun, and this is an attribute
  1305. state = MW_COLON_STATE_TAG
  1306. elif state == 5: # MW_COLON_STATE_COMMENT:
  1307. if text[i] == '-':
  1308. state = MW_COLON_STATE_COMMENTDASH
  1309. elif state == MW_COLON_STATE_COMMENTDASH:
  1310. if text[i] == '-':
  1311. state = MW_COLON_STATE_COMMENTDASHDASH
  1312. else:
  1313. state = MW_COLON_STATE_COMMENT
  1314. elif state == MW_COLON_STATE_COMMENTDASHDASH:
  1315. if text[i] == '>':
  1316. state = MW_COLON_STATE_TEXT
  1317. else:
  1318. state = MW_COLON_STATE_COMMENT
  1319. else:
  1320. raise ValueError('invalid state in findColonNoLinks')
  1321. i += 1 # advance the scan; without this the loop never terminates
  1321. if stack > 0:
  1322. return False
  1323. return False
  1324. def doBlockLevels(self, text, linestart):
  1325. # Parsing through the text line by line. The main thing
  1326. # happening here is handling of block-level elements p, pre,
  1327. # and making lists from lines starting with * # : etc.
  1328. lastPrefix = u''
  1329. mDTopen = inBlockElem = False
  1330. prefixLength = 0
  1331. paragraphStack = False
  1332. _closeMatchPat = re.compile(ur"(</table|</blockquote|</h1|</h2|</h3|</h4|</h5|</h6|<td|<th|<div|</div|<hr|</pre|</p|" + self.uniq_prefix + ur"-pre|</li|</ul|</ol|<center)", re.UNICODE | re.IGNORECASE)
  1333. mInPre = False
  1334. mLastSection = u''
  1335. mDTopen = False
  1336. output = []
  1337. for oLine in text.split('\n')[not linestart and 1 or 0:]:
  1338. lastPrefixLength = len(lastPrefix)
  1339. preCloseMatch = _closePrePat.search(oLine)
  1340. preOpenMatch = _openPrePat.search(oLine)
  1341. if not mInPre:
  1342. chars = u'*#:;'
  1343. prefixLength = 0
  1344. for c in oLine:
  1345. if c in chars:
  1346. prefixLength += 1
  1347. else:
  1348. break
  1349. pref = oLine[0:prefixLength]
  1350. # eh?
  1351. pref2 = pref.replace(u';', u':')
  1352. t = oLine[prefixLength:]
  1353. mInPre = bool(preOpenMatch)
  1354. else:
  1355. # Don't interpret any other prefixes in preformatted text
  1356. prefixLength = 0
  1357. pref = pref2 = u''
  1358. t = oLine
  1359. # List generation
  1360. if prefixLength and lastPrefix == pref2:
  1361. # Same as the last item, so no need to deal with nesting or opening stuff
  1362. tmpOutput, tmpMDTopen = self.nextItem(pref[-1:], mDTopen)
  1363. output.append(tmpOutput)
  1364. if tmpMDTopen is not None:
  1365. mDTopen = tmpMDTopen
  1366. paragraphStack = False
  1367. if pref[-1:] == u';':
  1368. # The one nasty exception: definition lists work like this:
  1369. # ; title : definition text
  1370. # So we check for : in the remainder text to split up the
  1371. # title and definition, without b0rking links.
  1372. term = t2 = u''
  1373. z = self.findColonNoLinks(t, term, t2)
  1374. if z != False:
  1375. term, t2 = z[0:2]
  1376. t = t2
  1377. output.append(term)
  1378. tmpOutput, tmpMDTopen = self.nextItem(u':', mDTopen)
  1379. output.append(tmpOutput)
  1380. if tmpMDTopen is not None:
  1381. mDTopen = tmpMDTopen
  1382. elif prefixLength or lastPrefixLength:
  1383. # Either open or close a level...
  1384. commonPrefixLength = self.getCommon(pref, lastPrefix)
  1385. paragraphStack = False
  1386. while commonPrefixLength < lastPrefixLength:
  1387. tmp = self.closeList(lastPrefix[lastPrefixLength-1], mDTopen)
  1388. output.append(tmp)
  1389. mDTopen = False
  1390. lastPrefixLength -= 1
  1391. if prefixLength <= commonPrefixLength and commonPrefixLength > 0:
  1392. tmpOutput, tmpMDTopen = self.nextItem(pref[commonPrefixLength-1], mDTopen)
  1393. output.append(tmpOutput)
  1394. if tmpMDTopen is not None:
  1395. mDTopen = tmpMDTopen
  1396. while prefixLength > commonPrefixLength:
  1397. char = pref[commonPrefixLength:commonPrefixLength+1]
  1398. tmpOutput, tmpMDTOpen = self.openList(char, mLastSection)
  1399. if tmpMDTOpen:
  1400. mDTopen = True
  1401. output.append(tmpOutput)
  1402. mLastSection = u''
  1403. mInPre = False
  1404. if char == u';':
  1405. # FIXME: This is dupe of code above
  1406. term = t2 = u''
  1407. z = self.findColonNoLinks(t, term, t2)
  1408. if z != False:
  1409. term, t2 = z[0:2]
  1410. t = t2
  1411. output.append(term)
  1412. tmpOutput, tmpMDTopen = self.nextItem(u':', mDTopen)
  1413. output.append(tmpOutput)
  1414. if tmpMDTopen is not None:
  1415. mDTopen = tmpMDTopen
  1416. commonPrefixLength += 1
  1417. lastPrefix = pref2
  1418. if prefixLength == 0:
  1419. # No prefix (not in list)--go to paragraph mode
  1420. # XXX: use a stack for nestable elements like span, table and div
  1421. openmatch = _openMatchPat.search(t)
  1422. closematch = _closeMatchPat.search(t)
  1423. if openmatch or closematch:
  1424. paragraphStack = False
  1425. output.append(self.closeParagraph(mLastSection))
  1426. mLastSection = u''
  1427. if preCloseMatch:
  1428. mInPre = False
  1429. if preOpenMatch:
  1430. mInPre = True
  1431. inBlockElem = bool(not closematch)
  1432. elif not inBlockElem and not mInPre:
  1433. if t[0:1] == u' ' and (mLastSection == u'pre' or t.strip() != u''):
  1434. # pre
  1435. if mLastSection != u'pre':
  1436. paragraphStack = False
  1437. output.append(self.closeParagraph(u'') + u'<pre>')
  1438. mInPre = False
  1439. mLastSection = u'pre'
  1440. t = t[1:]
  1441. else:
  1442. # paragraph
  1443. if t.strip() == u'':
  1444. if paragraphStack:
  1445. output.append(paragraphStack + u'<br />')
  1446. paragraphStack = False
  1447. mLastSection = u'p'
  1448. else:
  1449. if mLastSection != u'p':
  1450. output.append(self.closeParagraph(mLastSection))
  1451. mLastSection = u''
  1452. mInPre = False
  1453. paragraphStack = u'<p>'
  1454. else:
  1455. paragraphStack = u'</p><p>'
  1456. else:
  1457. if paragraphStack:
  1458. output.append(paragraphStack)
  1459. paragraphStack = False
  1460. mLastSection = u'p'
  1461. elif mLastSection != u'p':
  1462. output.append(self.closeParagraph(mLastSection) + u'<p>')
  1463. mLastSection = u'p'
  1464. mInPre = False
  1465. # somewhere above we forget to get out of pre block (bug 785)
  1466. if preCloseMatch and mInPre:
  1467. mInPre = False
  1468. if paragraphStack == False:
  1469. output.append(t + u"\n")
  1470. while prefixLength:
  1471. output.append(self.closeList(pref2[prefixLength-1], mDTopen))
  1472. mDTopen = False
  1473. prefixLength -= 1
  1474. if mLastSection != u'':
  1475. output.append(u'</' + mLastSection + u'>')
  1476. mLastSection = u''
  1477. return ''.join(output)
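# Usage sketch (editor's illustration; not part of the original file). Line
# prefixes become nested list markup and the trailing blank line closes it:
# >>> BaseParser().doBlockLevels(u'* one\n* two\n', True)
# u'<ul><li> one\n</li><li> two\n</li></ul>\n'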
  1478. class Parser(BaseParser):
  1479. def __init__(self, show_toc=True):
  1480. super(Parser, self).__init__()
  1481. self.show_toc = show_toc
  1482. def parse(self, text):
  1483. utf8 = isinstance(text, str)
  1484. text = to_unicode(text)
  1485. if text[-1:] != u'\n':
  1486. text = text + u'\n'
  1487. taggedNewline = True
  1488. else:
  1489. taggedNewline = False
  1490. text = self.strip(text)
  1491. text = self.removeHtmlTags(text)
  1492. text = self.doTableStuff(text)
  1493. text = self.parseHorizontalRule(text)
  1494. text = self.checkTOC(text)
  1495. text = self.parseHeaders(text)
  1496. text = self.parseAllQuotes(text)
  1497. text = self.replaceExternalLinks(text)
  1498. if not self.show_toc and text.find(u"<!--MWTOC-->") == -1:
  1499. self.show_toc = False
  1500. text = self.formatHeadings(text, True)
  1501. text = self.unstrip(text)
  1502. text = self.fixtags(text)
  1503. text = self.doBlockLevels(text, True)
  1504. text = self.unstripNoWiki(text)
  1505. text = text.split(u'\n')
  1506. text = u'\n'.join(text)
  1507. if taggedNewline and text[-1:] == u'\n':
  1508. text = text[:-1]
  1509. if utf8:
  1510. return text.encode("utf-8")
  1511. return text
  1512. def checkTOC(self, text):
  1513. if text.find(u"__NOTOC__") != -1:
  1514. text = text.replace(u"__NOTOC__", u"")
  1515. self.show_toc = False
  1516. if text.find(u"__TOC__") != -1:
  1517. text = text.replace(u"__TOC__", u"<!--MWTOC-->")
  1518. self.show_toc = True
  1519. return text
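# Usage sketch (editor's illustration; not part of the original file). The
# magic words just flip show_toc and leave a marker for formatHeadings():
# >>> p = Parser()
# >>> p.checkTOC(u'__NOTOC__ intro')
# u' intro'
# >>> p.show_toc
# False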
  1520. def doTableStuff(self, text):
  1521. t = text.split(u"\n")
  1522. td = [] # Is currently a td tag open?
  1523. ltd = [] # Was it TD or TH?
  1524. tr = [] # Is currently a tr tag open?
  1525. ltr = [] # tr attributes
  1526. has_opened_tr = [] # Did this table open a <tr> element?
  1527. indent_level = 0 # indent level of the table
  1528. for k, x in zip(range(len(t)), t):
  1529. x = x.strip()
  1530. fc = x[0:1]
  1531. matches = _zomgPat.match(x)
  1532. if matches:
  1533. indent_level = len(matches.group(1))
  1534. attributes = self.unstripForHTML(matches.group(2))
  1535. t[k] = u'<dl><dd>'*indent_level + u'<table' + self.fixTagAttributes(attributes, u'table') + u'>'
  1536. td.append(False)
  1537. ltd.append(u'')
  1538. tr.append(False)
  1539. ltr.append(u'')
  1540. has_opened_tr.append(False)
  1541. elif len(td) == 0:
  1542. pass
  1543. elif u'|}' == x[0:2]:
  1544. z = u"</table>" + x[2:]
  1545. l = ltd.pop()
  1546. if not has_opened_tr.pop():
  1547. z = u"<tr><td></td><tr>" + z
  1548. if tr.pop():
  1549. z = u"</tr>" + z
  1550. if td.pop():
  1551. z = u'</' + l + u'>' + z
  1552. ltr.pop()
  1553. t[k] = z + u'</dd></dl>'*indent_level
  1554. elif u'|-' == x[0:2]: # Allows for |-------------
  1555. x = x[1:]
  1556. while x != u'' and x[0:1] == '-':
  1557. x = x[1:]
  1558. z = ''
  1559. l = ltd.pop()
  1560. has_opened_tr.pop()
  1561. has_opened_tr.append(True)
  1562. if tr.pop():
  1563. z = u'</tr>' + z
  1564. if td.pop():
  1565. z = u'</' + l + u'>' + z
  1566. ltr.pop()
  1567. t[k] = z
  1568. tr.append(False)
  1569. td.append(False)
  1570. ltd.append(u'')
  1571. attributes = self.unstripForHTML(x)
  1572. ltr.append(self.fixTagAttributes(attributes, u'tr'))
  1573. elif u'|' == fc or u'!' == fc or u'|+' == x[0:2]: # Caption
  1574. # x is a table row
  1575. if u'|+' == x[0:2]:
  1576. fc = u'+'
  1577. x = x[1:]
  1578. x = x[1:]
  1579. if fc == u'!':
  1580. x = x.replace(u'!!', u'||')
  1581. # Split up multiple cells on the same line.
  1582. # FIXME: This can result in improper nesting of tags processed
  1583. # by earlier parser steps, but should avoid splitting up eg
  1584. # attribute values containing literal "||".
  1585. x = x.split(u'||')
  1586. t[k] = u''
  1587. # Loop through each table cell
  1588. for theline in x:
  1589. z = ''
  1590. if fc != u'+':
  1591. tra = ltr.pop()
  1592. if not tr.pop():
  1593. z = u'<tr' + tra + u'>\n'
  1594. tr.append(True)
  1595. ltr.append(u'')
  1596. has_opened_tr.pop()
  1597. has_opened_tr.append(True)
  1598. l = ltd.pop()
  1599. if td.pop():
  1600. z = u'</' + l + u'>' + z
  1601. if fc == u'|':
  1602. l = u'td'
  1603. elif fc == u'!':
  1604. l = u'th'
  1605. elif fc == u'+':
  1606. l = u'caption'
  1607. else:
  1608. l = u''
  1609. ltd.append(l)
  1610. #Cell parameters
  1611. y = theline.split(u'|', 1)
  1612. # Note that a '|' inside an invalid link should not
  1613. # be mistaken as delimiting cell parameters
  1614. if y[0].find(u'[[') != -1:
  1615. y = [theline]
  1616. if len(y) == 1:
  1617. y = z + u"<" + l + u">" + y[0]
  1618. else:
  1619. attributes = self.unstripForHTML(y[0])
  1620. y = z + u"<" + l + self.fixTagAttributes(attributes, l) + u">" + y[1]
  1621. t[k] += y
  1622. td.append(True)
  1623. while len(td) > 0:
  1624. l = ltd.pop()
  1625. if td.pop():
  1626. t.append(u'</td>')
  1627. if tr.pop():
  1628. t.append(u'</tr>')
  1629. if not has_opened_tr.pop():
  1630. t.append(u'<tr><td></td></tr>')
  1631. t.append(u'</table>')
  1632. text = u'\n'.join(t)
  1633. # special case: don't return empty table
  1634. if text == u"<table>\n<tr><td></td></tr>\n</table>":
  1635. text = u''
  1636. return text
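# Usage sketch (editor's illustration; not part of the original file). A
# minimal one-cell table in MediaWiki markup:
# >>> Parser().doTableStuff(u'{|\n| cell\n|}')
# u'<table>\n<tr>\n<td> cell\n</td></tr></table>'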
  1637. def formatHeadings(self, text, isMain):
  1638. """
  1639. This function accomplishes several tasks:
  1640. 1) Auto-number headings if that option is enabled
  1641. 2) Add an [edit] link to sections for logged in users who have enabled the option
  1642. 3) Add a Table of contents on the top for users who have enabled the option
  1643. 4) Auto-anchor headings
  1644. It loops through all headlines, collects the necessary data, then splits up the
  1645. string and re-inserts the newly formatted headlines.
  1646. """
  1647. doNumberHeadings = False
  1648. showEditLink = True # Can User Edit
  1649. if text.find(u"__NOEDITSECTION__") != -1:
  1650. showEditLink = False
  1651. text = text.replace(u"__NOEDITSECTION__", u"")
  1652. # Get all headlines for numbering them and adding funky stuff like [edit]
  1653. # links - this is for later, but we need the number of headlines right now
  1654. matches = _headerPat.findall(text)
  1655. numMatches = len(matches)
  1656. # if there are fewer than 4 headlines in the article, do not show TOC
  1657. # unless it's been explicitly enabled.
  1658. enoughToc = self.show_toc and (numMatches >= 4 or text.find(u"<!--MWTOC-->") != -1)
  1659. # Allow user to stipulate that a page should have a "new section"
  1660. # link added via __NEWSECTIONLINK__
  1661. showNewSection = False
  1662. if text.find(u"__NEWSECTIONLINK__") != -1:
  1663. showNewSection = True
  1664. text = text.replace(u"__NEWSECTIONLINK__", u"")
  1665. # if the string __FORCETOC__ (not case-sensitive) occurs in the HTML,
  1666. # override above conditions and always show TOC above first header
  1667. if text.find(u"__FORCETOC__") != -1:
  1668. self.show_toc = True
  1669. enoughToc = True
  1670. text = text.replace(u"__FORCETOC__", u"")
  1671. # Never ever show TOC if no headers
  1672. if numMatches < 1:
  1673. enoughToc = False
  1674. # headline counter
  1675. head…

Large files are truncated; the remainder of this file is not shown.