
/2013spring/exts/sphinxcontrib/bibtex/latex_lexer.py

https://bitbucket.org/chiamingyen/mdecourse
# -*- coding: utf-8 -*-
"""
    Simple incremental latex lexer
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
"""

import codecs
import collections
import re


class Token(collections.namedtuple("Token", "name text")):
    """Stores information about a matched token."""
    __slots__ = ()  # efficiency

    def __new__(cls, name=None, text=None):
        return tuple.__new__(
            cls,
            (name if name is not None else 'unknown',
             text if text is not None else b''))

    def __nonzero__(self):
        return bool(self.text)

    def __len__(self):
        return len(self.text)

    def decode(self, encoding):
        if self.name == 'control_word':
            return self.text.decode(encoding) + u' '
        else:
            return self.text.decode(encoding)
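
# A quick sanity check of Token behaviour (illustrative, not part of the
# original module): control words get a trailing space when decoded, so
# that e.g. b'\\alpha' followed by more letters stays unambiguous after
# decoding.
#
#     >>> Token('control_word', b'\\alpha').decode('ascii')
#     u'\\alpha '
#     >>> Token('chars', b'x').decode('ascii')
#     u'x'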

# implementation note: we derive from IncrementalDecoder because this
# class serves excellently as a base class for incremental decoders,
# but of course we don't decode yet until later


class LatexLexer(codecs.IncrementalDecoder):
    """A very simple lexer for tex/latex code."""

    # implementation note: every token **must** be decodable by inputenc
    tokens = [
        # comment: for ease, and for speed, we handle it as a token
        ('comment', br'%.*?\n'),
        # control tokens
        # in latex, some control tokens skip following whitespace
        # ('control-word' and 'control-symbol')
        # others do not ('control-symbol-x')
        # XXX TBT says no control symbols skip whitespace (except '\ ')
        # XXX but tests reveal otherwise?
        ('control_word', br'[\\][a-zA-Z]+'),
        ('control_symbol', br'[\\][~' br"'" br'"` =^!]'),
        ('control_symbol_x', br'[\\][^a-zA-Z]'),  # TODO should only match ascii
        # parameter tokens
        # also support a lone hash so we can lex things like b'#a'
        ('parameter', br'\#[0-9]|\#'),
        # any remaining characters; for ease we also handle space and
        # newline as tokens
        ('space', br' '),
        ('newline', br'\n'),
        ('mathshift', br'[$]'),
        # note: some chars joined together to make it easier to detect
        # symbols that have a special function (i.e. --, ---, etc.)
        ('chars',
         br'---|--|-|[`][`]'
         br"|['][']"
         br'|[?][`]|[!][`]'
         # separate chars because brackets are optional
         # e.g. fran\\c cais = fran\\c{c}ais in latex
         # so only way to detect \\c acting on c only is this way
         br'|[0-9a-zA-Z{}]'
         # we have to join everything else together to support
         # multibyte encodings: every token must be decodable!!
         # this means for instance that \\c öké is NOT equivalent to
         # \\c{ö}
         br'|[^ %#$\n\\]+'),
        # trailing garbage which we cannot decode otherwise
        # (such as a lone '\' at the end of a buffer)
        # is never emitted, but used internally by the buffer
        ('unknown', br'.'),
    ]

    def __init__(self, errors='strict'):
        """Initialize the codec."""
        self.errors = errors
        # regular expression used for matching
        self.regexp = re.compile(
            b"|".join(
                b"(?P<" + name.encode() + b">" + regexp + b")"
                for name, regexp in self.tokens),
            re.DOTALL)
        # reset state
        self.reset()

    def reset(self):
        """Reset state (also called by __init__ to initialize the
        state).
        """
        # buffer for storing last (possibly incomplete) token
        self.raw_buffer = Token()

    def getstate(self):
        return (self.raw_buffer.text, 0)

    def setstate(self, state):
        self.raw_buffer = Token('unknown', state[0])

    def get_raw_tokens(self, bytes_, final=False):
        """Yield tokens without any further processing. Tokens are one of:

        - ``\\<word>``: a control word (i.e. a command)
        - ``\\<symbol>``: a control symbol (i.e. \\^ etc.)
        - ``#<n>``: a parameter
        - a series of byte characters
        """
        if not isinstance(bytes_, bytes):
            raise TypeError(
                'expected bytes but got %s'
                % bytes_.__class__.__name__)
        if self.raw_buffer:
            bytes_ = self.raw_buffer.text + bytes_
            self.raw_buffer = Token()
        for match in self.regexp.finditer(bytes_):
            for name, regexp in self.tokens:
                text = match.group(name)
                if text is not None:
                    # yield the buffer token(s)
                    for token in self.flush_raw_tokens():
                        yield token
                    # fill buffer with next token
                    self.raw_buffer = Token(name, text)
                    break
            else:
                # should not happen
                raise AssertionError("lexer failed on '%s'" % bytes_)
        if final:
            for token in self.flush_raw_tokens():
                yield token

    def flush_raw_tokens(self):
        """Flush the raw token buffer."""
        if self.raw_buffer:
            yield self.raw_buffer
            self.raw_buffer = Token()
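
# Illustrative usage of the raw lexer (not part of the original module;
# the expected stream below was traced by hand from the patterns above,
# so treat it as a sketch):
#
#     >>> lexer = LatexLexer()
#     >>> [t.name for t in lexer.get_raw_tokens(b'\\alpha x % hi\n', final=True)]
#     ['control_word', 'space', 'chars', 'space', 'comment']
#
# Note that 'x' lexes as a single-character 'chars' token: the
# [0-9a-zA-Z{}] alternative fires before the catch-all run, so letters
# come out one at a time and bracket-less accents like \c can be matched.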

class LatexIncrementalLexer(LatexLexer):
    """A very simple incremental lexer for tex/latex code. Roughly
    follows the state machine described in Tex By Topic, Chapter 2.

    The generated tokens satisfy:

    * no newline characters: paragraphs are separated by '\\par'
    * spaces following control tokens are compressed
    """

    def reset(self):
        """Reset state (also called by __init__ to initialize the
        state).
        """
        LatexLexer.reset(self)
        # three possible states:
        # newline (N), skipping spaces (S), and middle of line (M)
        self.state = 'N'
        # inline math mode?
        self.inline_math = False

    def getstate(self):
        # state 'M' is most common, so let that be zero
        return (
            self.raw_buffer,
            {'M': 0, 'N': 1, 'S': 2}[self.state]
            | (4 if self.inline_math else 0)
        )

    def setstate(self, state):
        self.raw_buffer = state[0]
        self.state = {0: 'M', 1: 'N', 2: 'S'}[state[1] & 3]
        self.inline_math = bool(state[1] & 4)
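
    # How the packed state integer reads (an illustrative note, not part
    # of the original module): bits 0-1 hold the line state and bit 2 the
    # inline math flag.  For example, state 'S' inside inline math packs
    # to 2 | 4 == 6, and setstate unpacks it as 6 & 3 == 2 -> 'S' and
    # 6 & 4 -> inline_math True.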

    def get_tokens(self, bytes_, final=False):
        """Yield tokens while maintaining a state. Also skip
        whitespace after control words and (some) control symbols.
        Replaces newlines by spaces and \\par commands depending on
        the context.
        """
        # current position relative to the start of bytes_ in the sequence
        # of bytes that have been decoded
        pos = -len(self.raw_buffer)
        for token in self.get_raw_tokens(bytes_, final=final):
            pos = pos + len(token)
            assert pos >= 0  # first token includes at least self.raw_buffer
            if token.name == 'newline':
                if self.state == 'N':
                    # if state was 'N', generate new paragraph
                    yield Token('control_word', b'\\par')
                elif self.state == 'S':
                    # switch to 'N' state, do not generate a space
                    self.state = 'N'
                elif self.state == 'M':
                    # switch to 'N' state, generate a space
                    self.state = 'N'
                    yield Token('space', b' ')
                else:
                    raise AssertionError(
                        "unknown tex state '%s'" % self.state)
            elif token.name == 'space':
                if self.state == 'N':
                    # remain in 'N' state, no space token generated
                    pass
                elif self.state == 'S':
                    # remain in 'S' state, no space token generated
                    pass
                elif self.state == 'M':
                    # in M mode, generate the space,
                    # but switch to space skip mode
                    self.state = 'S'
                    yield token
                else:
                    raise AssertionError(
                        "unknown state %s" % repr(self.state))
            elif token.name == 'char':
                self.state = 'M'
                yield token
            elif token.name == 'mathshift':
                self.inline_math = not self.inline_math
                yield token
            elif token.name == 'parameter':
                self.state = 'M'
                yield token
            elif token.name == 'control_word':
                # go to space skip mode
                self.state = 'S'
                yield token
            elif token.name == 'control_symbol':
                # go to space skip mode
                self.state = 'S'
                yield token
            elif token.name == 'control_symbol_x':
                # don't skip following space, so go to M mode
                self.state = 'M'
                yield token
            elif token.name == 'comment':
                # go to newline mode, no token is generated
                # note: comment includes the newline
                self.state = 'N'
            elif token.name == 'chars':
                self.state = 'M'
                yield token
            elif token.name == 'unknown':
                if self.errors == 'strict':
                    # current position within bytes_
                    # this is the position right after the unknown token
                    raise UnicodeDecodeError(
                        "latex",  # codec
                        bytes_,  # problematic input
                        pos - len(token),  # start of problematic token
                        pos,  # end of it
                        "unknown token %s" % repr(token.text))
                elif self.errors == 'ignore':
                    # do nothing
                    pass
                elif self.errors == 'replace':
                    yield Token('chars', b'?' * len(token))
                else:
                    raise NotImplementedError(
                        "error mode %s not supported" % repr(self.errors))

class LatexIncrementalDecoder(LatexIncrementalLexer):
    """Simple incremental decoder. Transforms lexed latex tokens into
    unicode.

    To customize decoding, subclass and override
    :meth:`get_unicode_tokens`.
    """

    inputenc = "ascii"
    """Input encoding. **Must** extend ascii."""

    def get_unicode_tokens(self, bytes_, final=False):
        """:meth:`decode` calls this function to produce the final
        sequence of unicode strings. This implementation simply
        decodes every sequence in *inputenc* encoding. Override to
        process the tokens in some other way (for example, for token
        translation).
        """
        for token in self.get_tokens(bytes_, final=final):
            yield token.decode(self.inputenc)

    def decode(self, bytes_, final=False):
        try:
            return u''.join(self.get_unicode_tokens(bytes_, final=final))
        except UnicodeDecodeError as e:
            # API requires that the decode method raises a ValueError
            # in this case
            raise ValueError(e)
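
# Illustrative round trip through the decoder (not part of the original
# module); note the trailing space that Token.decode appends to control
# words, which keeps b'\\TeX rules' and b'\\TeXrules' distinguishable:
#
#     >>> d = LatexIncrementalDecoder()
#     >>> d.decode(b'\\TeX rules\n\n', final=True)
#     u'\\TeX rules \\par '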

class LatexIncrementalEncoder(codecs.IncrementalEncoder):
    """Simple incremental encoder for latex."""

    inputenc = "ascii"
    """Input encoding. **Must** extend ascii."""

    def get_latex_bytes(self, unicode_, final=False):
        """:meth:`encode` calls this function to produce the final
        sequence of latex bytes. This implementation simply
        encodes every sequence in *inputenc* encoding. Override to
        process the unicode in some other way (for example, for character
        translation).
        """
        if not isinstance(unicode_, basestring):
            raise TypeError(
                "expected unicode for encode input, but got {0} instead"
                .format(unicode_.__class__.__name__))
        for c in unicode_:
            yield c.encode(self.inputenc, self.errors)

    def encode(self, unicode_, final=False):
        """Encode unicode string into a latex byte sequence."""
        try:
            return b''.join(self.get_latex_bytes(unicode_, final=final))
        except UnicodeEncodeError as e:
            # API requires that the encode method raises a ValueError
            # in this case
            raise ValueError(e)
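
# Minimal usage sketch of the encoder (illustrative, not part of the
# original module): plain ascii passes straight through, while a
# character that inputenc cannot represent raises ValueError under the
# default 'strict' error mode.
#
#     >>> e = LatexIncrementalEncoder()
#     >>> e.encode(u'hello', final=True)
#     'hello'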