PageRenderTime 104ms CodeModel.GetById 21ms RepoModel.GetById 1ms app.codeStats 0ms

/kitsune/wiki/parser.py

https://gitlab.com/Guy1394/kitsune
Python | 495 lines | 487 code | 7 blank | 1 comment | 0 complexity | 7ece2d52d3b0900e48519bd07a86ff5d MD5 | raw file
  1. import re
  2. from itertools import count
  3. from xml.sax.saxutils import quoteattr
  4. from django.conf import settings
  5. from html5lib import HTMLParser
  6. from html5lib.serializer.htmlserializer import HTMLSerializer
  7. from html5lib.treebuilders import getTreeBuilder
  8. from html5lib.treewalkers import getTreeWalker
  9. from lxml.etree import Element
  10. from statsd import statsd
  11. from django.utils.translation import ugettext as _, ugettext_lazy as _lazy
  12. from kitsune.gallery.models import Image
  13. from kitsune.sumo import parser as sumo_parser
  14. from kitsune.sumo.parser import ALLOWED_ATTRIBUTES, get_object_fallback
  15. from kitsune.sumo.utils import uselocale
  16. from kitsune.wiki.models import Document
# Block elements wikimarkup knows about (and thus preserves). Also consulted
# by ForParser.expand_fors() to decide whether a <for> must be rendered as a
# block-level <div> or can remain an inline <span>.
BLOCK_LEVEL_ELEMENTS = ['table', 'blockquote', 'h1', 'h2', 'h3', 'h4', 'h5',
                        'h6', 'td', 'th', 'div', 'hr', 'pre', 'p', 'li', 'ul',
                        'ol', 'center', 'dl', 'dt', 'dd', 'ins', 'del',
                        'section']

# Matches one {{{name}}} template-argument placeholder; group 1 is the
# argument name. Used by _format_template_content().
TEMPLATE_ARG_REGEX = re.compile('{{{([^{]+?)}}}')
  23. def wiki_to_html(wiki_markup, locale=settings.WIKI_DEFAULT_LANGUAGE,
  24. doc_id=None, parser_cls=None):
  25. """Wiki Markup -> HTML with the wiki app's enhanced parser"""
  26. if parser_cls is None:
  27. parser_cls = WikiParser
  28. with statsd.timer('wiki.render'):
  29. with uselocale(locale):
  30. content = parser_cls(doc_id=doc_id).parse(
  31. wiki_markup, show_toc=False, locale=locale,
  32. toc_string=_('Table of Contents'))
  33. return content
  34. def _format_template_content(content, params):
  35. """Formats a template's content using passed in arguments"""
  36. def arg_replace(matchobj):
  37. """Takes a regex matching {{{name}} and returns params['name']"""
  38. param_name = matchobj.group(1)
  39. if param_name in params:
  40. return params[param_name]
  41. return TEMPLATE_ARG_REGEX.sub(arg_replace, content)
  42. def _build_template_params(params_str):
  43. """Builds a dictionary from a given list of raw strings passed in by the
  44. user.
  45. Example syntax it handles:
  46. * ['one', 'two'] turns into {1: 'one', 2: 'two'}
  47. * ['12=blah'] turns into {12: 'blah'}
  48. * ['name=value'] turns into {'name': 'value'}
  49. """
  50. i = 0
  51. params = {}
  52. for item in params_str:
  53. param, __, value = item.partition('=')
  54. if value:
  55. params[param] = value
  56. else:
  57. i = i + 1
  58. params[str(i)] = param
  59. return params
  60. # Custom syntax using regexes follows below.
  61. # * turn tags of the form {tag content} into <span class="tag">content</span>
  62. # * expand {key ctrl+alt} into <span class="key">ctrl</span> +
  63. # <span class="key">alt</span>
  64. # * turn {note}note{/note} into <div class="note">a note</div>
  65. def _key_split(matchobj):
  66. """Expands a {key a+b+c} syntax into <span class="key">a</span> + ...
  67. More explicitly, it takes a regex matching {key ctrl+alt+del} and returns:
  68. <span class="key">ctrl</span> + <span class="key">alt</span> +
  69. <span class="key">del</span>
  70. """
  71. keys = [k.strip() for k in matchobj.group(1).split('+')]
  72. return ' + '.join(['<span class="key">%s</span>' % key for key in keys])
# Compiled (regex, replacement) pairs, applied in declaration order by
# parse_simple_syntax(). DOTALL lets '.' span newlines so e.g. a multi-line
# {note}...{/note} body still matches.
PATTERNS = [
    (re.compile(pattern, re.DOTALL), replacement) for
    pattern, replacement in (
        # (x, y), replace x with y
        (r'{(?P<name>note|warning)}', '<div class="\g<name>">'),
        (r'\{/(note|warning)\}', '</div>'),
        # To use } as a key, this syntax won't work. Use [[T:key|}]] instead
        (r'\{key (.+?)\}', _key_split),  # ungreedy: stop at the first }
        (r'{(?P<name>button|menu|filepath|pref) (?P<content>.*?)}',
         '<span class="\g<name>">\g<content></span>'),
    )]
  84. def parse_simple_syntax(text):
  85. for pattern, replacement in PATTERNS:
  86. text = pattern.sub(replacement, text)
  87. return text
class ForParser(object):
    """HTML 5 parser which finds <for> tags and translates them into spans and
    divs having the proper data- elements and classes.

    As a side effect, repairs poorly matched pairings of <for> (and other
    tags), probably in favor of the location of the opening tag.

    """
    # html5lib tree builder/walker backend used throughout:
    TREEBUILDER = 'lxml'
    # Tag used to wrap the (possibly multi-rooted) fragment for querying:
    CONTAINER_TAG = 'div'

    def __init__(self, html):
        """Create a parse tree from the given HTML."""
        def really_parse_fragment(parser, html):
            """Parse a possibly multi-rooted HTML fragment, wrapping it in a
            <div> to make it easy to query later.

            As far as I can tell, this is what parseFragment is supposed to do
            (but doesn't). See
            http://code.google.com/p/html5lib/issues/detail?id=161.

            """
            top_level_elements = parser.parseFragment(html)
            container = Element(self.CONTAINER_TAG)

            # Why lxml couldn't just have text nodes, I'll never understand.
            # Text nodes that come other than first are automatically stuffed
            # into the tail attrs of the preceding elements by html5lib.
            if top_level_elements and isinstance(top_level_elements[0],
                                                 basestring):
                container.text = top_level_elements.pop(0)

            container.extend(top_level_elements)
            return container

        p = HTMLParser(tree=getTreeBuilder(self.TREEBUILDER))
        self._root = really_parse_fragment(p, html)

    def expand_fors(self):
        """Turn the for tags into spans and divs, and apply data attrs.

        If a for contains any block-level elements, it turns into a div.
        Otherwise, it turns into a span.

        """
        html_ns = 'http://www.w3.org/1999/xhtml'
        for for_el in self._root.xpath('//html:for',
                                       namespaces={'html': html_ns}):
            # ElementTree find('{ns}tag') checks direct children only: a
            # <for> with any block-level direct child must itself become a
            # block-level <div>; otherwise an inline <span> suffices.
            for_el.tag = ('div' if any(for_el.find('{' + html_ns + '}' + tag)
                                       is not None
                                       for tag in BLOCK_LEVEL_ELEMENTS)
                          else 'span')
            for_el.attrib['class'] = 'for'

    def to_unicode(self):
        """Return the unicode serialization of myself."""
        container_len = len(self.CONTAINER_TAG) + 2  # 2 for the <>
        walker = getTreeWalker(self.TREEBUILDER)
        stream = walker(self._root)
        serializer = HTMLSerializer(quote_attr_values=True,
                                    omit_optional_tags=False)
        # Slice off the wrapping <div>...</div> added by __init__; the
        # closing tag is one char longer than the opener because of the '/'.
        return serializer.render(stream)[container_len:-container_len - 1]

    @staticmethod
    def _on_own_line(match, postspace):
        """Return (whether the tag is on its own line, whether the tag is at
        the very top of the string, whether the tag is at the very bottom of
        the string).

        Tolerates whitespace to the right of the tag: a tag with trailing
        whitespace on the line can still be considered to be on its own line.

        """
        # In _FOR_OR_CLOSER, group 2 is the tag itself and group 4 is its
        # trailing whitespace.
        pos_before_tag = match.start(2) - 1
        if pos_before_tag >= 0:
            at_left = match.string[pos_before_tag] == '\n'
            at_top = False
        else:
            at_left = at_top = True
        at_bottom_modulo_space = match.end(4) == len(match.string)
        at_right_modulo_space = at_bottom_modulo_space or '\n' in postspace
        return (at_left and at_right_modulo_space,
                at_top, at_bottom_modulo_space)

    @staticmethod
    def _wiki_to_tag(attrs):
        """Turn {for ...} into <for data-for="...">."""
        if not attrs:
            return '<for>'
        # Strip leading and trailing whitespace from each value for easier
        # matching in the JS:
        stripped = ','.join([x.strip() for x in attrs.split(',')])
        return '<for data-for=' + quoteattr(stripped) + '>'

    # Matches an opening {for}/{for attrs} or a closing {/for}, plus any
    # surrounding whitespace. Groups: (1) leading whitespace, (2) the whole
    # tag, (3) the attrs of an opener, (4) trailing whitespace.
    _FOR_OR_CLOSER = re.compile(r'(\s*)'
                                r'(\{for(?: +([^\}]*))?\}|{/for})'
                                r'(\s*)', re.MULTILINE)

    @classmethod
    def strip_fors(cls, text):
        """Replace each {for} or {/for} tag with a unique token the
        wiki formatter will treat as inline.

        Return (stripped text,
                dehydrated fors for use with unstrip_fors).

        """
        # "attributes" of {for a, b} directives, like "a, b" keyed for token
        # number
        dehydrations = {}
        indexes = count()

        def dehydrate(match):
            """Close over `dehydrations`, sock the {for}s away therein, and
            replace {for}s and {/for}s with tokens."""
            def paragraph_padding(str):
                """If str doesn't contain at least 2 newlines, return enough
                such that appending them will cause it to."""
                return '\n' * max(2 - str.count('\n'), 0)

            def preceding_whitespace(str, pos):
                """Return all contiguous whitespace preceding str[pos]."""
                # NOTE(review): xrange(pos - 1, 0, -1) stops *before* index 0,
                # so a whitespace char at the very start of the string is
                # never collected -- looks like an off-by-one; confirm intent.
                whitespace = []
                for i in xrange(pos - 1, 0, -1):
                    if str[i] in '\t \n\r':
                        whitespace.append(str[i])
                    else:
                        break
                whitespace.reverse()
                return ''.join(whitespace)

            prespace, tag, attrs, postspace = match.groups()

            if tag != '{/for}':
                # Opening {for}: stash its <for ...> form and emit a numbered
                # \x07-delimited token the wiki formatter treats as inline.
                i = indexes.next()
                dehydrations[i] = cls._wiki_to_tag(attrs)
                token = u'\x07%i\x07' % i
            else:
                # Closing {/for}: one shared sentinel token is enough.
                token = u'\x07/sf\x07'

            # If the {for} or {/for} is on a line by itself (righthand
            # whitespace is allowed; left would indicate a <pre>), make sure it
            # has enough newlines on each side to make it its own paragraph,
            # lest it get sucked into being part of the next or previous
            # paragraph:
            on_own_line, at_top, at_bottom = cls._on_own_line(match, postspace)
            if on_own_line:
                # If tag (excluding leading whitespace) wasn't at top of
                # document, space it off from preceding block elements:
                if not at_top:
                    # If there are already enough \ns before the tag to
                    # distance it from the preceding paragraph, take them into
                    # account before adding more.
                    prespace += paragraph_padding(
                        preceding_whitespace(match.string, match.start(1))
                        + prespace)

                # If tag (including trailing whitespace) wasn't at the bottom
                # of the document, space it off from following block elements:
                if not at_bottom:
                    postspace += paragraph_padding(postspace)

            return prespace + token + postspace

        # Do single replaces over and over, taking into account the effects of
        # previous ones so that whitespace added in a previous replacement can
        # be considered for its role in helping to nudge an adjacent block-
        # level {for} into its own paragraph. There's no pos arg to replace(),
        # so we had to write our own.
        pos = 0
        while True:
            m = cls._FOR_OR_CLOSER.search(text, pos)
            if m is None:
                return text, dehydrations
            done = text[:m.start()] + dehydrate(m)  # already been searched
            pos = len(done)
            text = done + text[m.end():]

    # Dratted wiki formatter likes to put <p> tags around my token when it sits
    # on a line by itself, so tolerate and consume that foolishness:
    _PARSED_STRIPPED_FOR = re.compile(
        # Whitespace, a {for} token, then more whitespace (including <br>s):
        r'<p>'
        r'(?:\s|<br\s*/?>)*'
        r'\x07(\d+)\x07'  # The {for} token
        r'(?:\s|<br\s*/?>)*'
        r'</p>'
        # Alternately, a lone {for} token that didn't get wrapped in a <p>:
        r'|\x07(\d+)\x07')
    _PARSED_STRIPPED_FOR_CLOSER = re.compile(
        # Similar to above, a {/for} token wrapped in <p> and whitespace:
        r'<p>'
        r'(?:\s|<br\s*/?>)*'
        r'\x07/sf\x07'  # {/for} token
        r'(?:\s|<br\s*/?>)*'
        r'</p>'
        # Or a lone {/for} token:
        r'|\x07/sf\x07')

    @classmethod
    def unstrip_fors(cls, html, dehydrations):
        """Replace the tokens with <for> tags the ForParser understands."""
        def hydrate(match):
            # Either group 1 (token wrapped in <p>) or group 2 (bare token)
            # holds the number; unknown numbers hydrate to the empty string.
            return dehydrations.get(int(match.group(1) or match.group(2)), '')

        # Put <for ...> tags back in:
        html = cls._PARSED_STRIPPED_FOR.sub(hydrate, html)

        # Replace {/for} tags:
        return cls._PARSED_STRIPPED_FOR_CLOSER.sub(u'</for>', html)
# Returned by the Include/Template hooks when a document would include
# itself, directly or transitively (see WikiParser.inclusions).
# L10n: This error is displayed if a template is included into itself.
RECURSION_MESSAGE = _lazy(u'[Recursive inclusion of "%s"]')
  267. class WikiParser(sumo_parser.WikiParser):
  268. """An extension of the parser from the forums adding more crazy features
  269. {for} tags, inclusions, and templates--oh my!
  270. """
  271. image_template = 'wikiparser/hook_image_lazy.html'
  272. def __init__(self, base_url=None, doc_id=None):
  273. """
  274. doc_id -- If you want to be nice, pass the ID of the Document you are
  275. rendering. This will make recursive inclusions fail immediately
  276. rather than after the first round of recursion.
  277. """
  278. super(WikiParser, self).__init__(base_url)
  279. # Stack of document IDs to prevent Include or Template recursion:
  280. self.inclusions = [doc_id] if doc_id else []
  281. # The wiki has additional hooks not used elsewhere
  282. self.registerInternalLinkHook('Include', self._hook_include)
  283. self.registerInternalLinkHook('I', self._hook_include)
  284. self.registerInternalLinkHook('Template', self._hook_template)
  285. self.registerInternalLinkHook('T', self._hook_template)
  286. def parse(self, text, **kwargs):
  287. """Wrap SUMO's parse() to support additional wiki-only features."""
  288. # Replace fors with inline tokens the wiki formatter will tolerate:
  289. text, data = ForParser.strip_fors(text)
  290. # Do simple substitutions:
  291. text = parse_simple_syntax(text)
  292. # Run the formatter:
  293. html = super(WikiParser, self).parse(
  294. text, youtube_embeds=False, **kwargs)
  295. # Put the fors back in (as XML-ish <for> tags this time):
  296. html = ForParser.unstrip_fors(html, data)
  297. # Balance badly paired <for> tags:
  298. for_parser = ForParser(html)
  299. # Convert them to spans and divs:
  300. for_parser.expand_fors()
  301. html = for_parser.to_unicode()
  302. html = self.add_youtube_embeds(html)
  303. return html
  304. def _hook_include(self, parser, space, title):
  305. """Returns the document's parsed content."""
  306. message = _('The document "%s" does not exist.') % title
  307. include = get_object_fallback(Document, title, locale=self.locale)
  308. if not include or not include.current_revision:
  309. return message
  310. if include.id in parser.inclusions:
  311. return RECURSION_MESSAGE % title
  312. else:
  313. parser.inclusions.append(include.id)
  314. ret = parser.parse(include.current_revision.content, show_toc=False,
  315. locale=self.locale)
  316. parser.inclusions.pop()
  317. return ret
  318. # Wiki templates are documents that receive arguments.
  319. #
  320. # They can be useful when including similar content in multiple places,
  321. # with slight variations. For examples and details see:
  322. # http://www.mediawiki.org/wiki/Help:Templates
  323. #
  324. def _hook_template(self, parser, space, title):
  325. """Handles Template:Template name, formatting the content using given
  326. args"""
  327. params = title.split('|')
  328. short_title = params.pop(0)
  329. template_title = 'Template:' + short_title
  330. message = _('The template "%s" does not exist or has no approved '
  331. 'revision.') % short_title
  332. template = get_object_fallback(Document, template_title,
  333. locale=self.locale, is_template=True)
  334. if not template or not template.current_revision:
  335. return message
  336. if template.id in parser.inclusions:
  337. return RECURSION_MESSAGE % template_title
  338. else:
  339. parser.inclusions.append(template.id)
  340. c = template.current_revision.content.rstrip()
  341. # Note: this completely ignores the allowed attributes passed to the
  342. # WikiParser.parse() method and defaults to ALLOWED_ATTRIBUTES.
  343. parsed = parser.parse(c, show_toc=False, attributes=ALLOWED_ATTRIBUTES,
  344. locale=self.locale)
  345. parser.inclusions.pop()
  346. # Special case for inline templates
  347. if '\n' not in c:
  348. parsed = parsed.replace('<p>', '')
  349. parsed = parsed.replace('</p>', '')
  350. # Do some string formatting to replace parameters
  351. return _format_template_content(parsed, _build_template_params(params))
  352. class WhatLinksHereParser(WikiParser):
  353. """An extension of the wiki that deals with what links here data."""
  354. def __init__(self, doc_id, **kwargs):
  355. self.current_doc = Document.objects.get(pk=doc_id)
  356. return (super(WhatLinksHereParser, self)
  357. .__init__(doc_id=doc_id, **kwargs))
  358. def _hook_internal_link(self, parser, space, name):
  359. """Records links between documents, and then calls super()."""
  360. title = name.split('|')[0]
  361. locale = self.current_doc.locale
  362. linked_doc = get_object_fallback(Document, title, locale)
  363. if linked_doc is not None:
  364. self.current_doc.add_link_to(linked_doc, 'link')
  365. return (super(WhatLinksHereParser, self)
  366. ._hook_internal_link(parser, space, name))
  367. def _hook_template(self, parser, space, name):
  368. """Record a template link between documents, and then call super()."""
  369. params = name.split('|')
  370. template = get_object_fallback(Document, 'Template:' + params[0],
  371. locale=self.locale, is_template=True)
  372. if template:
  373. self.current_doc.add_link_to(template, 'template')
  374. return (super(WhatLinksHereParser, self)
  375. ._hook_template(parser, space, name))
  376. def _hook_include(self, parser, space, name):
  377. """Record an include link between documents, and then call super()."""
  378. include = get_object_fallback(Document, name, locale=self.locale)
  379. if include:
  380. self.current_doc.add_link_to(include, 'include')
  381. return (super(WhatLinksHereParser, self)
  382. ._hook_include(parser, space, name))
  383. def _hook_image_tag(self, parser, space, name):
  384. """Record an image is included in a document, then call super()."""
  385. title = name.split('|')[0]
  386. image = get_object_fallback(Image, title, self.locale)
  387. if image:
  388. self.current_doc.add_image(image)
  389. return (super(WhatLinksHereParser, self)
  390. ._hook_image_tag(parser, space, name))