parser.py | searchcode

/kitsune/wiki/parser.py

https://gitlab.com/Guy1394/kitsune
Python | 495 lines | 487 code | 7 blank | 1 comment | 0 complexity | 7ece2d52d3b0900e48519bd07a86ff5d MD5 | raw file

import re
from itertools import count
from xml.sax.saxutils import quoteattr

from django.conf import settings

from html5lib import HTMLParser
from html5lib.serializer.htmlserializer import HTMLSerializer
from html5lib.treebuilders import getTreeBuilder
from html5lib.treewalkers import getTreeWalker
from lxml.etree import Element
from statsd import statsd
from django.utils.translation import ugettext as _, ugettext_lazy as _lazy

from kitsune.gallery.models import Image
from kitsune.sumo import parser as sumo_parser
from kitsune.sumo.parser import ALLOWED_ATTRIBUTES, get_object_fallback
from kitsune.sumo.utils import uselocale
from kitsune.wiki.models import Document


# block elements wikimarkup knows about (and thus preserves)
# ForParser.expand_fors() consults this list: a <for> that has one of these
# tags as a direct child becomes a <div>, otherwise a <span>.
BLOCK_LEVEL_ELEMENTS = ['table', 'blockquote', 'h1', 'h2', 'h3', 'h4', 'h5',
                        'h6', 'td', 'th', 'div', 'hr', 'pre', 'p', 'li', 'ul',
                        'ol', 'center', 'dl', 'dt', 'dd', 'ins', 'del',
                        'section']
# Matches a template argument placeholder of the form {{{name}}}. Group 1
# captures the name between the braces; the name may not contain "{".
TEMPLATE_ARG_REGEX = re.compile('{{{([^{]+?)}}}')


def wiki_to_html(wiki_markup, locale=settings.WIKI_DEFAULT_LANGUAGE,
                 doc_id=None, parser_cls=None):
    """Render wiki markup to HTML using the wiki app's enhanced parser.

    wiki_markup -- the markup to render
    locale -- locale activated while rendering and passed on to the parser
    doc_id -- ID of the document being rendered, handed to the parser
    parser_cls -- parser class to instantiate; WikiParser when None

    The whole render is timed under the 'wiki.render' statsd key.

    """
    cls = WikiParser if parser_cls is None else parser_cls

    # Both the parser construction and the translated TOC heading are
    # evaluated while `locale` is active.
    with statsd.timer('wiki.render'), uselocale(locale):
        parser = cls(doc_id=doc_id)
        content = parser.parse(wiki_markup, show_toc=False, locale=locale,
                               toc_string=_('Table of Contents'))
    return content


def _format_template_content(content, params):
    """Fill a template's {{{name}}} placeholders from the passed arguments.

    content -- template text containing {{{name}}} placeholders
    params -- dict mapping placeholder names to replacement strings, as
        built by _build_template_params()

    Returns content with every placeholder replaced by its value. A
    placeholder whose name is not in params is replaced by the empty
    string.

    """

    def arg_replace(matchobj):
        """Takes a regex matching {{{name}}} and returns params['name']"""
        # Return '' explicitly for unknown names instead of falling off the
        # end and handing None to re.sub().
        return params.get(matchobj.group(1), '')

    return TEMPLATE_ARG_REGEX.sub(arg_replace, content)


def _build_template_params(params_str):
    """Turn the raw argument strings of a template call into a dict.

    Every key of the result is a string. Arguments without a value are
    positional and are numbered from '1' in order of appearance:

    * ['one', 'two']   turns into     {'1': 'one', '2': 'two'}
    * ['12=blah']      turns into     {'12': 'blah'}
    * ['name=value']   turns into     {'name': 'value'}

    Only the first '=' separates name from value. An argument whose value
    part is empty (e.g. 'name=') is treated as positional, with the text
    before the '=' as its value.

    """
    params = {}
    positional = 0
    for raw in params_str:
        name, __, value = raw.partition('=')
        if not value:
            # No (or empty) value: store under the next positional number.
            positional += 1
            params[str(positional)] = name
            continue
        params[name] = value
    return params


# Custom syntax using regexes follows below.
# * turn tags of the form {tag content} into <span class="tag">content</span>
# * expand {key ctrl+alt} into <span class="key">ctrl</span> +
#   <span class="key">alt</span>
# * turn {note}a note{/note} into <div class="note">a note</div>

def _key_split(matchobj):
    """Render a {key a+b+c} match as one <span class="key"> per key.

    Group 1 of the match holds the '+'-separated key names. Each name is
    stripped of surrounding whitespace and wrapped in its own span; the
    spans are joined with ' + '. For {key ctrl+alt+del} the result is:
    <span class="key">ctrl</span> + <span class="key">alt</span> +
    <span class="key">del</span>

    """
    spans = []
    for raw_key in matchobj.group(1).split('+'):
        spans.append('<span class="key">%s</span>' % raw_key.strip())
    return ' + '.join(spans)


# (compiled regex, replacement) pairs applied in order by
# parse_simple_syntax(). A replacement is either a template string or a
# callable taking the match object. Patterns are compiled with re.DOTALL.
# Replacement templates are raw strings so that "\g<name>" is not an invalid
# string escape sequence.
PATTERNS = [
    (re.compile(pattern, re.DOTALL), replacement) for
    pattern, replacement in (
        # (x, y), replace x with y
        (r'{(?P<name>note|warning)}', r'<div class="\g<name>">'),
        (r'\{/(note|warning)\}', '</div>'),
        # To use } as a key, this syntax won't work. Use [[T:key|}]] instead
        (r'\{key (.+?)\}', _key_split),  # ungreedy: stop at the first }
        (r'{(?P<name>button|menu|filepath|pref) (?P<content>.*?)}',
         r'<span class="\g<name>">\g<content></span>'),
    )]


def parse_simple_syntax(text):
    """Apply every (regex, replacement) pair in PATTERNS to text, in order.

    Each substitution runs on the output of the previous one. Returns the
    resulting text.

    """
    result = text
    for regex, repl in PATTERNS:
        result = regex.sub(repl, result)
    return result


class ForParser(object):
    """HTML 5 parser which finds <for> tags and translates them into spans and
    divs carrying the "for" class.

    As a side effect, repairs poorly matched pairings of <for> (and other
    tags), probably in favor of the location of the opening tag.

    The classmethods strip_fors() and unstrip_fors() bracket the wiki
    formatter: the first swaps {for}/{/for} directives for opaque tokens,
    the second swaps the tokens in the formatted HTML for <for>/</for> tags.

    """
    TREEBUILDER = 'lxml'
    CONTAINER_TAG = 'div'

    def __init__(self, html):
        """Create a parse tree from the given HTML."""
        def really_parse_fragment(parser, html):
            """Parse a possibly multi-rooted HTML fragment, wrapping it in a
            <div> to make it easy to query later.

            As far as I can tell, this is what parseFragment is supposed to do
            (but doesn't). See
            http://code.google.com/p/html5lib/issues/detail?id=161.

            """
            top_level_elements = parser.parseFragment(html)
            container = Element(self.CONTAINER_TAG)

            # Why lxml couldn't just have text nodes, I'll never understand.
            # Text nodes that come other than first are automatically stuffed
            # into the tail attrs of the preceding elements by html5lib.
            if top_level_elements and isinstance(top_level_elements[0],
                                                 basestring):
                container.text = top_level_elements.pop(0)

            container.extend(top_level_elements)
            return container

        p = HTMLParser(tree=getTreeBuilder(self.TREEBUILDER))
        self._root = really_parse_fragment(p, html)

    def expand_fors(self):
        """Turn the for tags into spans and divs, and give them the "for"
        class.

        If a for has a block-level element (see BLOCK_LEVEL_ELEMENTS) as a
        direct child, it turns into a div. Otherwise, it turns into a span.

        """
        html_ns = 'http://www.w3.org/1999/xhtml'
        for for_el in self._root.xpath('//html:for',
                                       namespaces={'html': html_ns}):
            for_el.tag = ('div' if any(for_el.find('{' + html_ns + '}' + tag)
                                       is not None
                                       for tag in BLOCK_LEVEL_ELEMENTS)
                          else 'span')
            for_el.attrib['class'] = 'for'

    def to_unicode(self):
        """Return the unicode serialization of myself."""
        container_len = len(self.CONTAINER_TAG) + 2  # 2 for the <>
        walker = getTreeWalker(self.TREEBUILDER)
        stream = walker(self._root)
        serializer = HTMLSerializer(quote_attr_values=True,
                                    omit_optional_tags=False)
        # Slice off the wrapping container: the opening tag is container_len
        # characters long, the closing tag one more (for the slash).
        return serializer.render(stream)[container_len:-container_len - 1]

    @staticmethod
    def _on_own_line(match, postspace):
        """Return (whether the tag is on its own line, whether the tag is at
        the very top of the string, whether the tag is at the very bottom of
        the string).

        Tolerates whitespace to the right of the tag: a tag with trailing
        whitespace on the line can still be considered to be on its own line.

        match -- a _FOR_OR_CLOSER match (group 2 is the tag, group 4 the
            trailing whitespace)
        postspace -- the whitespace matched after the tag

        """
        pos_before_tag = match.start(2) - 1
        if pos_before_tag >= 0:
            at_left = match.string[pos_before_tag] == '\n'
            at_top = False
        else:
            at_left = at_top = True
        at_bottom_modulo_space = match.end(4) == len(match.string)
        at_right_modulo_space = at_bottom_modulo_space or '\n' in postspace
        return (at_left and at_right_modulo_space,
                at_top, at_bottom_modulo_space)

    @staticmethod
    def _wiki_to_tag(attrs):
        """Turn {for ...} into <for data-for="...">.

        attrs -- the comma-separated text after "for", or a false value for
            a bare {for}, which yields a plain <for>

        """
        if not attrs:
            return '<for>'
        # Strip leading and trailing whitespace from each value for easier
        # matching in the JS:
        stripped = ','.join([x.strip() for x in attrs.split(',')])
        return '<for data-for=' + quoteattr(stripped) + '>'

    # Groups: 1 = leading whitespace, 2 = the {for ...} or {/for} tag,
    # 3 = the attributes of a {for ...} (if any), 4 = trailing whitespace.
    _FOR_OR_CLOSER = re.compile(r'(\s*)'
                                r'(\{for(?: +([^\}]*))?\}|{/for})'
                                r'(\s*)', re.MULTILINE)

    @classmethod
    def strip_fors(cls, text):
        """Replace each {for} or {/for} tag with a unique token the
        wiki formatter will treat as inline.

        Return (stripped text,
                dehydrated fors for use with unstrip_fors).

        """
        # "attributes" of {for a, b} directives, like "a, b" keyed for token
        # number
        dehydrations = {}
        indexes = count()

        def dehydrate(match):
            """Close over `dehydrations`, sock the {for}s away therein, and
            replace {for}s and {/for}s with tokens."""
            def paragraph_padding(s):
                """If s doesn't contain at least 2 newlines, return enough
                such that appending them will cause it to."""
                return '\n' * max(2 - s.count('\n'), 0)

            def preceding_whitespace(s, pos):
                """Return all contiguous whitespace preceding s[pos]."""
                # Walk left from pos, down to and including index 0, for as
                # long as the characters are whitespace.
                start = pos
                while start > 0 and s[start - 1] in '\t \n\r':
                    start -= 1
                return s[start:pos]

            prespace, tag, attrs, postspace = match.groups()

            if tag != '{/for}':
                # The next() builtin works on any iterator, unlike the
                # Python 2-only .next() method.
                i = next(indexes)
                dehydrations[i] = cls._wiki_to_tag(attrs)
                token = u'\x07%i\x07' % i
            else:
                token = u'\x07/sf\x07'

            # If the {for} or {/for} is on a line by itself (righthand
            # whitespace is allowed; left would indicate a <pre>), make sure it
            # has enough newlines on each side to make it its own paragraph,
            # lest it get sucked into being part of the next or previous
            # paragraph:
            on_own_line, at_top, at_bottom = cls._on_own_line(match, postspace)
            if on_own_line:
                # If tag (excluding leading whitespace) wasn't at top of
                # document, space it off from preceding block elements:
                if not at_top:
                    # If there are already enough \ns before the tag to
                    # distance it from the preceding paragraph, take them into
                    # account before adding more.
                    prespace += paragraph_padding(
                        preceding_whitespace(match.string, match.start(1)) +
                        prespace)

                # If tag (including trailing whitespace) wasn't at the bottom
                # of the document, space it off from following block elements:
                if not at_bottom:
                    postspace += paragraph_padding(postspace)

            return prespace + token + postspace

        # Do single replaces over and over, taking into account the effects of
        # previous ones so that whitespace added in a previous replacement can
        # be considered for its role in helping to nudge an adjacent block-
        # level {for} into its own paragraph. There's no pos arg to replace(),
        # so we had to write our own.
        pos = 0
        while True:
            m = cls._FOR_OR_CLOSER.search(text, pos)
            if m is None:
                return text, dehydrations
            done = text[:m.start()] + dehydrate(m)  # already been searched
            pos = len(done)
            text = done + text[m.end():]

    # Dratted wiki formatter likes to put <p> tags around my token when it sits
    # on a line by itself, so tolerate and consume that foolishness:
    _PARSED_STRIPPED_FOR = re.compile(
        # Whitespace, a {for} token, then more whitespace (including <br>s):
        r'<p>'
        r'(?:\s|<br\s*/?>)*'
        r'\x07(\d+)\x07'  # The {for} token
        r'(?:\s|<br\s*/?>)*'
        r'</p>'
        # Alternately, a lone {for} token that didn't get wrapped in a <p>:
        r'|\x07(\d+)\x07')
    _PARSED_STRIPPED_FOR_CLOSER = re.compile(
        # Similar to above, a {/for} token wrapped in <p> and whitespace:
        r'<p>'
        r'(?:\s|<br\s*/?>)*'
        r'\x07/sf\x07'  # {/for} token
        r'(?:\s|<br\s*/?>)*'
        r'</p>'
        # Or a lone {/for} token:
        r'|\x07/sf\x07')

    @classmethod
    def unstrip_fors(cls, html, dehydrations):
        """Replace the tokens with <for> tags the ForParser understands.

        html -- formatter output still containing the tokens
        dehydrations -- the token-number -> <for ...> tag dict returned by
            strip_fors(); tokens with no entry are replaced by ''

        """
        def hydrate(match):
            # Group 1 is set for a <p>-wrapped token, group 2 for a lone one.
            return dehydrations.get(int(match.group(1) or match.group(2)), '')

        # Put <for ...> tags back in:
        html = cls._PARSED_STRIPPED_FOR.sub(hydrate, html)

        # Replace {/for} tags:
        return cls._PARSED_STRIPPED_FOR_CLOSER.sub(u'</for>', html)


# Lazily translated message returned by the include and template hooks in
# place of the content; "%s" is filled with the title that was requested.
# L10n: This error is displayed if a template is included into itself.
RECURSION_MESSAGE = _lazy(u'[Recursive inclusion of "%s"]')


class WikiParser(sumo_parser.WikiParser):
    """An extension of the parser from the forums adding more crazy features

    {for} tags, inclusions, and templates--oh my!

    """

    image_template = 'wikiparser/hook_image_lazy.html'

    def __init__(self, base_url=None, doc_id=None):
        """
        base_url -- passed through to the parent parser's constructor.
        doc_id -- If you want to be nice, pass the ID of the Document you are
            rendering. This will make recursive inclusions fail immediately
            rather than after the first round of recursion.

        """
        super(WikiParser, self).__init__(base_url)

        # Stack of document IDs to prevent Include or Template recursion:
        self.inclusions = [doc_id] if doc_id else []

        # The wiki has additional hooks not used elsewhere
        self.registerInternalLinkHook('Include', self._hook_include)
        self.registerInternalLinkHook('I', self._hook_include)
        self.registerInternalLinkHook('Template', self._hook_template)
        self.registerInternalLinkHook('T', self._hook_template)

    def parse(self, text, **kwargs):
        """Wrap SUMO's parse() to support additional wiki-only features.

        text -- the wiki markup to render
        kwargs -- passed on to the parent parse(); youtube_embeds is always
            forced to False there and the embeds are added here afterwards

        Returns the rendered HTML.

        """

        # Replace fors with inline tokens the wiki formatter will tolerate:
        text, data = ForParser.strip_fors(text)

        # Do simple substitutions:
        text = parse_simple_syntax(text)

        # Run the formatter:
        html = super(WikiParser, self).parse(
            text, youtube_embeds=False, **kwargs)

        # Put the fors back in (as XML-ish <for> tags this time):
        html = ForParser.unstrip_fors(html, data)

        # Balance badly paired <for> tags:
        for_parser = ForParser(html)

        # Convert them to spans and divs:
        for_parser.expand_fors()

        html = for_parser.to_unicode()

        html = self.add_youtube_embeds(html)

        return html

    def _hook_include(self, parser, space, title):
        """Returns the document's parsed content.

        Returns a translated error message instead when the document does not
        exist or has no current revision, and RECURSION_MESSAGE when the
        document is already on the parser's inclusion stack.

        """
        include = get_object_fallback(Document, title, locale=self.locale)
        if not include or not include.current_revision:
            return _('The document "%s" does not exist.') % title

        if include.id in parser.inclusions:
            return RECURSION_MESSAGE % title

        parser.inclusions.append(include.id)
        try:
            return parser.parse(include.current_revision.content,
                                show_toc=False, locale=self.locale)
        finally:
            # Keep the recursion-guard stack balanced even if the nested
            # parse raises.
            parser.inclusions.pop()

    # Wiki templates are documents that receive arguments.
    #
    # They can be useful when including similar content in multiple places,
    # with slight variations. For examples and details see:
    # http://www.mediawiki.org/wiki/Help:Templates
    #
    def _hook_template(self, parser, space, title):
        """Handles Template:Template name, formatting the content using given
        args.

        title -- the template's short title, optionally followed by
            '|'-separated arguments

        Returns the rendered template with its {{{name}}} placeholders filled
        in, a translated error message when the template does not exist or
        has no current revision, or RECURSION_MESSAGE when the template is
        already on the parser's inclusion stack.

        """
        params = title.split('|')
        short_title = params.pop(0)
        template_title = 'Template:' + short_title

        template = get_object_fallback(Document, template_title,
                                       locale=self.locale, is_template=True)

        if not template or not template.current_revision:
            return _('The template "%s" does not exist or has no approved '
                     'revision.') % short_title

        if template.id in parser.inclusions:
            return RECURSION_MESSAGE % template_title

        c = template.current_revision.content.rstrip()
        parser.inclusions.append(template.id)
        try:
            # Note: this completely ignores the allowed attributes passed to
            # the WikiParser.parse() method and defaults to
            # ALLOWED_ATTRIBUTES.
            parsed = parser.parse(c, show_toc=False,
                                  attributes=ALLOWED_ATTRIBUTES,
                                  locale=self.locale)
        finally:
            # Keep the recursion-guard stack balanced even if the nested
            # parse raises.
            parser.inclusions.pop()

        # Special case for inline templates
        if '\n' not in c:
            parsed = parsed.replace('<p>', '')
            parsed = parsed.replace('</p>', '')
        # Do some string formatting to replace parameters
        return _format_template_content(parsed, _build_template_params(params))


class WhatLinksHereParser(WikiParser):
    """A WikiParser that records "what links here" data while rendering.

    Each overridden hook looks up the referenced object, registers it on the
    document being rendered (``self.current_doc``) and then defers to the
    parent hook for the actual output.

    """

    def __init__(self, doc_id, **kwargs):
        """Load the Document with primary key doc_id as the current document
        and initialize the parent parser with that doc_id."""
        self.current_doc = Document.objects.get(pk=doc_id)
        super(WhatLinksHereParser, self).__init__(doc_id=doc_id, **kwargs)

    def _hook_internal_link(self, parser, space, name):
        """Record a 'link' from the current document to the linked document,
        if it can be found, then render via the parent hook."""
        # Only the part before the first '|' is the document title.
        target_title = name.split('|')[0]
        target = get_object_fallback(Document, target_title,
                                     self.current_doc.locale)
        if target is not None:
            self.current_doc.add_link_to(target, 'link')

        parent = super(WhatLinksHereParser, self)
        return parent._hook_internal_link(parser, space, name)

    def _hook_template(self, parser, space, name):
        """Record a 'template' link from the current document to the named
        template, if it can be found, then render via the parent hook."""
        short_title = name.split('|')[0]
        found = get_object_fallback(Document, 'Template:' + short_title,
                                    locale=self.locale, is_template=True)
        if found:
            self.current_doc.add_link_to(found, 'template')

        parent = super(WhatLinksHereParser, self)
        return parent._hook_template(parser, space, name)

    def _hook_include(self, parser, space, name):
        """Record an 'include' link from the current document to the included
        document, if it can be found, then render via the parent hook."""
        found = get_object_fallback(Document, name, locale=self.locale)
        if found:
            self.current_doc.add_link_to(found, 'include')

        parent = super(WhatLinksHereParser, self)
        return parent._hook_include(parser, space, name)

    def _hook_image_tag(self, parser, space, name):
        """Record that the named image is used by the current document, if
        the image can be found, then render via the parent hook."""
        # Only the part before the first '|' is the image title.
        image_title = name.split('|')[0]
        found = get_object_fallback(Image, image_title, self.locale)
        if found:
            self.current_doc.add_image(found)

        parent = super(WhatLinksHereParser, self)
        return parent._hook_image_tag(parser, space, name)