
/2013spring/exts/sphinxcontrib/bibtex/latex_lexer.py

https://bitbucket.org/chiamingyen/mdecourse
# -*- coding: utf-8 -*-
"""
    Simple incremental latex lexer
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
"""

import codecs
import collections
import re


class Token(collections.namedtuple("Token", "name text")):
    """Stores information about a matched token."""
    __slots__ = ()  # efficiency

    def __new__(cls, name=None, text=None):
        return tuple.__new__(
            cls,
            (name if name is not None else 'unknown',
             text if text is not None else b''))

    def __nonzero__(self):
        return bool(self.text)

    def __len__(self):
        return len(self.text)

    def decode(self, encoding):
        if self.name == 'control_word':
            return self.text.decode(encoding) + u' '
        else:
            return self.text.decode(encoding)
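

# Editor's note: the _demo_* functions in this listing are illustrative
# sketches added for this page; they are not part of the original module
# and their names are hypothetical. This one shows Token behaviour:
# control words keep a trailing space when decoded, and an empty token
# is falsy (which is how the lexer tests its buffer).
def _demo_token():
    word = Token('control_word', b'\\par')
    assert word.decode('ascii') == u'\\par '
    assert not Token()  # empty token is falsy
    return word
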

# implementation note: we derive from IncrementalDecoder because this
# class serves excellently as a base class for incremental decoders,
# but of course we don't decode yet until later


class LatexLexer(codecs.IncrementalDecoder):
    """A very simple lexer for tex/latex code."""

    # implementation note: every token **must** be decodable by inputenc
    tokens = [
        # comment: for ease, and for speed, we handle it as a token
        ('comment', br'%.*?\n'),
        # control tokens
        # in latex, some control tokens skip following whitespace
        # ('control-word' and 'control-symbol');
        # others do not ('control-symbol-x')
        # XXX TBT says no control symbols skip whitespace (except '\ ')
        # XXX but tests reveal otherwise?
        ('control_word', br'[\\][a-zA-Z]+'),
        ('control_symbol', br'[\\][~' br"'" br'"` =^!]'),
        ('control_symbol_x', br'[\\][^a-zA-Z]'),  # TODO should only match ascii
        # parameter tokens
        # also support a lone hash so we can lex things like b'#a'
        ('parameter', br'\#[0-9]|\#'),
        # any remaining characters; for ease we also handle space and
        # newline as tokens
        ('space', br' '),
        ('newline', br'\n'),
        ('mathshift', br'[$]'),
        # note: some chars joined together to make it easier to detect
        # symbols that have a special function (i.e. --, ---, etc.)
        ('chars',
         br'---|--|-|[`][`]'
         br"|['][']"
         br'|[?][`]|[!][`]'
         # separate chars because brackets are optional
         # e.g. fran\\c cais = fran\\c{c}ais in latex
         # so the only way to detect \\c acting on c alone is this way
         br'|[0-9a-zA-Z{}]'
         # we have to join everything else together to support
         # multibyte encodings: every token must be decodable!!
         # this means for instance that \\c öké is NOT equivalent to
         # \\c{ö}
         br'|[^ %#$\n\\]+'),
        # trailing garbage which we cannot decode otherwise
        # (such as a lone '\' at the end of a buffer)
        # is never emitted, but used internally by the buffer
        ('unknown', br'.'),
    ]

    def __init__(self, errors='strict'):
        """Initialize the codec."""
        self.errors = errors
        # regular expression used for matching
        self.regexp = re.compile(
            b"|".join(
                b"(?P<" + name.encode() + b">" + regexp + b")"
                for name, regexp in self.tokens),
            re.DOTALL)
        # reset state
        self.reset()

    def reset(self):
        """Reset state (also called by __init__ to initialize the
        state).
        """
        # buffer for storing last (possibly incomplete) token
        self.raw_buffer = Token()

    def getstate(self):
        return (self.raw_buffer.text, 0)

    def setstate(self, state):
        self.raw_buffer = Token('unknown', state[0])

    def get_raw_tokens(self, bytes_, final=False):
        """Yield tokens without any further processing. Tokens are one of:

        - ``\\<word>``: a control word (i.e. a command)
        - ``\\<symbol>``: a control symbol (i.e. \\^ etc.)
        - ``#<n>``: a parameter
        - a series of byte characters
        """
        if not isinstance(bytes_, bytes):
            raise TypeError(
                'expected bytes but got %s'
                % bytes_.__class__.__name__)
        if self.raw_buffer:
            bytes_ = self.raw_buffer.text + bytes_
            self.raw_buffer = Token()
        for match in self.regexp.finditer(bytes_):
            for name, regexp in self.tokens:
                text = match.group(name)
                if text is not None:
                    # yield the buffer token(s)
                    for token in self.flush_raw_tokens():
                        yield token
                    # fill buffer with next token
                    self.raw_buffer = Token(name, text)
                    break
            else:
                # should not happen
                raise AssertionError("lexer failed on '%s'" % bytes_)
        if final:
            for token in self.flush_raw_tokens():
                yield token

    def flush_raw_tokens(self):
        """Flush the raw token buffer."""
        if self.raw_buffer:
            yield self.raw_buffer
            self.raw_buffer = Token()
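

# Illustrative sketch (editor's addition, hypothetical name): feeding
# bytes to the raw lexer in two chunks. The trailing, possibly
# incomplete control word b'\\fo' is buffered until the next chunk
# (or final=True) shows where it really ends.
def _demo_raw_tokens():
    lexer = LatexLexer()
    first = list(lexer.get_raw_tokens(b'hi \\fo'))
    # first == [chars b'h', chars b'i', space b' ']; b'\\fo' is buffered
    rest = list(lexer.get_raw_tokens(b'o!', final=True))
    # rest == [control_word b'\\foo', chars b'!']
    return first, rest
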

class LatexIncrementalLexer(LatexLexer):
    """A very simple incremental lexer for tex/latex code. Roughly
    follows the state machine described in Tex By Topic, Chapter 2.

    The generated tokens satisfy:

    * no newline characters: paragraphs are separated by '\\par'
    * spaces following control tokens are compressed
    """

    def reset(self):
        """Reset state (also called by __init__ to initialize the
        state).
        """
        LatexLexer.reset(self)
        # three possible states:
        # newline (N), skipping spaces (S), and middle of line (M)
        self.state = 'N'
        # inline math mode?
        self.inline_math = False

    def getstate(self):
        # state 'M' is most common, so let that be zero
        return (
            self.raw_buffer,
            {'M': 0, 'N': 1, 'S': 2}[self.state]
            | (4 if self.inline_math else 0)
        )

    def setstate(self, state):
        self.raw_buffer = state[0]
        self.state = {0: 'M', 1: 'N', 2: 'S'}[state[1] & 3]
        self.inline_math = bool(state[1] & 4)

    def get_tokens(self, bytes_, final=False):
        """Yield tokens while maintaining a state. Also skip
        whitespace after control words and (some) control symbols.
        Replaces newlines by spaces and \\par commands depending on
        the context.
        """
        # current position relative to the start of bytes_ in the sequence
        # of bytes that have been decoded
        pos = -len(self.raw_buffer)
        for token in self.get_raw_tokens(bytes_, final=final):
            pos = pos + len(token)
            assert pos >= 0  # first token includes at least self.raw_buffer
            if token.name == 'newline':
                if self.state == 'N':
                    # if state was 'N', generate new paragraph
                    yield Token('control_word', b'\\par')
                elif self.state == 'S':
                    # switch to 'N' state, do not generate a space
                    self.state = 'N'
                elif self.state == 'M':
                    # switch to 'N' state, generate a space
                    self.state = 'N'
                    yield Token('space', b' ')
                else:
                    raise AssertionError(
                        "unknown tex state '%s'" % self.state)
            elif token.name == 'space':
                if self.state == 'N':
                    # remain in 'N' state, no space token generated
                    pass
                elif self.state == 'S':
                    # remain in 'S' state, no space token generated
                    pass
                elif self.state == 'M':
                    # in M mode, generate the space,
                    # but switch to space skip mode
                    self.state = 'S'
                    yield token
                else:
                    raise AssertionError(
                        "unknown state %s" % repr(self.state))
            elif token.name == 'char':
                self.state = 'M'
                yield token
            elif token.name == 'mathshift':
                self.inline_math = not self.inline_math
                yield token
            elif token.name == 'parameter':
                self.state = 'M'
                yield token
            elif token.name == 'control_word':
                # go to space skip mode
                self.state = 'S'
                yield token
            elif token.name == 'control_symbol':
                # go to space skip mode
                self.state = 'S'
                yield token
            elif token.name == 'control_symbol_x':
                # don't skip following space, so go to M mode
                self.state = 'M'
                yield token
            elif token.name == 'comment':
                # go to newline mode, no token is generated
                # note: comment includes the newline
                self.state = 'N'
            elif token.name == 'chars':
                self.state = 'M'
                yield token
            elif token.name == 'unknown':
                if self.errors == 'strict':
                    # current position within bytes_
                    # this is the position right after the unknown token
                    raise UnicodeDecodeError(
                        "latex",  # codec
                        bytes_,  # problematic input
                        pos - len(token),  # start of problematic token
                        pos,  # end of it
                        "unknown token %s" % repr(token.text))
                elif self.errors == 'ignore':
                    # do nothing
                    pass
                elif self.errors == 'replace':
                    yield Token('chars', b'?' * len(token))
                else:
                    raise NotImplementedError(
                        "error mode %s not supported" % repr(self.errors))
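

# Illustrative sketch (editor's addition, hypothetical name): the
# N/S/M state machine at work. A blank line turns into a \par token,
# a newline inside a paragraph turns into a single space, and the
# space after the control word \emph is skipped.
def _demo_incremental_lexer():
    lexer = LatexIncrementalLexer()
    tokens = lexer.get_tokens(b'one\ntwo\n\n\\emph three', final=True)
    return [(token.name, token.text) for token in tokens]
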

class LatexIncrementalDecoder(LatexIncrementalLexer):
    """Simple incremental decoder. Transforms lexed latex tokens into
    unicode.

    To customize decoding, subclass and override
    :meth:`get_unicode_tokens`.
    """

    inputenc = "ascii"
    """Input encoding. **Must** extend ascii."""

    def get_unicode_tokens(self, bytes_, final=False):
        """:meth:`decode` calls this function to produce the final
        sequence of unicode strings. This implementation simply
        decodes every token in the *inputenc* encoding. Override to
        process the tokens in some other way (for example, for token
        translation).
        """
        for token in self.get_tokens(bytes_, final=final):
            yield token.decode(self.inputenc)

    def decode(self, bytes_, final=False):
        try:
            return u''.join(self.get_unicode_tokens(bytes_, final=final))
        except UnicodeDecodeError as e:
            # API requires that the decode method raises a ValueError
            # in this case
            raise ValueError(e)
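

# Illustrative sketch (editor's addition, hypothetical name): decoding
# ascii latex bytes. The newline collapses to a space, the run of
# spaces after \emph is dropped, and Token.decode appends the single
# space after the control word.
def _demo_decoder():
    decoder = LatexIncrementalDecoder()
    text = decoder.decode(b'\\emph  {hi}\nthere', final=True)
    assert text == u'\\emph {hi} there'
    return text
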

class LatexIncrementalEncoder(codecs.IncrementalEncoder):
    """Simple incremental encoder for latex."""

    inputenc = "ascii"
    """Input encoding. **Must** extend ascii."""

    def get_latex_bytes(self, unicode_, final=False):
        """:meth:`encode` calls this function to produce the final
        sequence of latex bytes. This implementation simply
        encodes every character in the *inputenc* encoding. Override to
        process the unicode in some other way (for example, for character
        translation).
        """
        if not isinstance(unicode_, basestring):
            raise TypeError(
                "expected unicode for encode input, but got {0} instead"
                .format(unicode_.__class__.__name__))
        for c in unicode_:
            yield c.encode(self.inputenc, self.errors)

    def encode(self, unicode_, final=False):
        """Encode unicode string into a latex byte sequence."""
        try:
            return b''.join(self.get_latex_bytes(unicode_, final=final))
        except (UnicodeEncodeError, UnicodeDecodeError) as e:
            # API requires that the encode method raises a ValueError
            # in this case
            raise ValueError(e)
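

# Illustrative sketch (editor's addition, hypothetical name): the base
# encoder passes ascii straight through; characters outside *inputenc*
# raise ValueError unless a subclass overrides get_latex_bytes to
# translate them.
def _demo_encoder():
    encoder = LatexIncrementalEncoder()
    assert encoder.encode(u'hello', final=True) == b'hello'
    return encoder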