
/debug_toolbar/utils/sqlparse/lexer.py

https://github.com/hakanw/django-debug-toolbar

# -*- coding: utf-8 -*-
# Copyright (C) 2008 Andi Albrecht, albrecht.andi@gmail.com
#
# This module is part of python-sqlparse and is released under
# the BSD License: http://www.opensource.org/licenses/bsd-license.php.

"""SQL Lexer"""

# This code is based on the SqlLexer in pygments.
# http://pygments.org/
# It's separated from the rest of pygments to increase performance
# and to allow some customizations.

import re

from debug_toolbar.utils.sqlparse import tokens
from debug_toolbar.utils.sqlparse.keywords import KEYWORDS, KEYWORDS_COMMON


class include(str):
    pass


class combined(tuple):
    """Indicates a state combined from multiple states."""

    def __new__(cls, *args):
        return tuple.__new__(cls, args)

    def __init__(self, *args):
        # tuple.__init__ doesn't do anything
        pass


def is_keyword(value):
    test = value.upper()
    return KEYWORDS_COMMON.get(test, KEYWORDS.get(test, tokens.Name)), value
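
# Illustrative note (added for clarity, not part of the original file):
# is_keyword returns a (tokentype, value) pair, mapping recognised keywords to
# the token type found in KEYWORDS_COMMON or KEYWORDS and everything else to
# tokens.Name; the value itself is passed through unchanged. Roughly (assuming
# SELECT is listed in KEYWORDS_COMMON as a DML keyword):
#
#   is_keyword('select')  ->  (tokens.Keyword.DML, 'select')
#   is_keyword('user_id') ->  (tokens.Name, 'user_id')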


def apply_filters(stream, filters, lexer=None):
    """
    Use this method to apply an iterable of filters to
    a stream. If lexer is given it's forwarded to the
    filter, otherwise the filter receives `None`.
    """
    def _apply(filter_, stream):
        for token in filter_.filter(lexer, stream):
            yield token
    for filter_ in filters:
        stream = _apply(filter_, stream)
    return stream
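

# A minimal usage sketch (added for illustration, not part of the original
# module): apply_filters wraps the stream in one lazy generator per filter, so
# filters run in the order they were given and only as the stream is consumed.
# Any object with a ``filter(lexer, stream)`` method works; the toy filter
# below simply drops whitespace tokens. This helper is never called by the
# lexer itself.
def _example_apply_filters():
    class _DropWhitespace(object):
        def filter(self, lexer, stream):
            for ttype, value in stream:
                if ttype is not tokens.Whitespace:
                    yield ttype, value

    stream = iter([(tokens.Keyword, u'select'),
                   (tokens.Whitespace, u' '),
                   (tokens.Wildcard, u'*')])
    # roughly: [(Token.Keyword, u'select'), (Token.Wildcard, u'*')]
    return list(apply_filters(stream, [_DropWhitespace()]))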


class LexerMeta(type):
    """
    Metaclass for Lexer, creates the self._tokens attribute from
    self.tokens on the first instantiation.
    """

    def _process_state(cls, unprocessed, processed, state):
        assert type(state) is str, "wrong state name %r" % state
        assert state[0] != '#', "invalid state name %r" % state
        if state in processed:
            return processed[state]
        tokenlist = processed[state] = []
        rflags = cls.flags
        for tdef in unprocessed[state]:
            if isinstance(tdef, include):
                # it's a state reference
                assert tdef != state, "circular state reference %r" % state
                tokenlist.extend(cls._process_state(
                    unprocessed, processed, str(tdef)))
                continue

            assert type(tdef) is tuple, "wrong rule def %r" % tdef

            try:
                rex = re.compile(tdef[0], rflags).match
            except Exception, err:
                raise ValueError(("uncompilable regex %r in state"
                                  " %r of %r: %s"
                                  % (tdef[0], state, cls, err)))

            assert type(tdef[1]) is tokens._TokenType or callable(tdef[1]), \
                   ('token type must be simple type or callable, not %r'
                    % (tdef[1],))

            if len(tdef) == 2:
                new_state = None
            else:
                tdef2 = tdef[2]
                if isinstance(tdef2, str):
                    # an existing state
                    if tdef2 == '#pop':
                        new_state = -1
                    elif tdef2 in unprocessed:
                        new_state = (tdef2,)
                    elif tdef2 == '#push':
                        new_state = tdef2
                    elif tdef2[:5] == '#pop:':
                        new_state = -int(tdef2[5:])
                    else:
                        assert False, 'unknown new state %r' % tdef2
                elif isinstance(tdef2, combined):
                    # combine a new state from existing ones
                    new_state = '_tmp_%d' % cls._tmpname
                    cls._tmpname += 1
                    itokens = []
                    for istate in tdef2:
                        assert istate != state, \
                               'circular state ref %r' % istate
                        itokens.extend(cls._process_state(unprocessed,
                                                          processed, istate))
                    processed[new_state] = itokens
                    new_state = (new_state,)
                elif isinstance(tdef2, tuple):
                    # push more than one state
                    for state in tdef2:
                        assert (state in unprocessed or
                                state in ('#pop', '#push')), \
                               'unknown new state ' + state
                    new_state = tdef2
                else:
                    assert False, 'unknown new state def %r' % tdef2
            tokenlist.append((rex, tdef[1], new_state))
        return tokenlist

    def process_tokendef(cls):
        cls._all_tokens = {}
        cls._tmpname = 0
        processed = cls._all_tokens[cls.__name__] = {}
        #tokendefs = tokendefs or cls.tokens[name]
        for state in cls.tokens.keys():
            cls._process_state(cls.tokens, processed, state)
        return processed

    def __call__(cls, *args, **kwds):
        if not hasattr(cls, '_tokens'):
            cls._all_tokens = {}
            cls._tmpname = 0
            if hasattr(cls, 'token_variants') and cls.token_variants:
                # don't process yet
                pass
            else:
                cls._tokens = cls.process_tokendef()

        return type.__call__(cls, *args, **kwds)
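

# Clarifying note (added; not part of the original file): after LexerMeta has
# run, each entry of ``cls._tokens[state]`` is a triple
# ``(regex_match, action, new_state)`` where ``new_state`` is one of:
#   * None               -- no state change
#   * a tuple of names   -- states to push onto the state stack
#   * '#push'            -- push the current state again
#   * a negative integer -- number of states to pop ('#pop' becomes -1)
# Lexer.get_tokens_unprocessed() below interprets these values at lexing time.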


class Lexer(object):

    __metaclass__ = LexerMeta

    encoding = 'utf-8'
    stripall = False
    stripnl = False
    tabsize = 0
    flags = re.IGNORECASE

    tokens = {
        'root': [
            (r'--.*?(\r\n|\r|\n)', tokens.Comment.Single),
            # $ matches *before* newline, therefore we have two patterns
            # to match Comment.Single
            (r'--.*?$', tokens.Comment.Single),
            (r'(\r|\n|\r\n)', tokens.Newline),
            (r'\s+', tokens.Whitespace),
            (r'/\*', tokens.Comment.Multiline, 'multiline-comments'),
            (r':=', tokens.Assignment),
            (r'::', tokens.Punctuation),
            (r'[*]', tokens.Wildcard),
            (r'CASE\b', tokens.Keyword),  # extended CASE(foo)
            (r"`(``|[^`])*`", tokens.Name),
            (r"´(´´|[^´])*´", tokens.Name),
            (r'\$([a-zA-Z_][a-zA-Z0-9_]*)?\$', tokens.Name.Builtin),
            (r'\?{1}', tokens.Name.Placeholder),
            (r'[$:?%][a-zA-Z0-9_]+[^$:?%]?', tokens.Name.Placeholder),
            (r'@[a-zA-Z_][a-zA-Z0-9_]+', tokens.Name),
            (r'[a-zA-Z_][a-zA-Z0-9_]*(?=[.(])', tokens.Name),  # see issue39
            (r'[<>=~!]+', tokens.Operator.Comparison),
            (r'[+/@#%^&|`?^-]+', tokens.Operator),
            (r'0x[0-9a-fA-F]+', tokens.Number.Hexadecimal),
            (r'[0-9]*\.[0-9]+', tokens.Number.Float),
            (r'[0-9]+', tokens.Number.Integer),
            # TODO: Backslash escapes?
            (r"(''|'.*?[^\\]')", tokens.String.Single),
            # not a real string literal in ANSI SQL:
            (r'(""|".*?[^\\]")', tokens.String.Symbol),
            (r'(\[.*[^\]]\])', tokens.Name),
            (r'(LEFT |RIGHT )?(INNER |OUTER )?JOIN\b', tokens.Keyword),
            (r'END( IF| LOOP)?\b', tokens.Keyword),
            (r'NOT NULL\b', tokens.Keyword),
            (r'CREATE( OR REPLACE)?\b', tokens.Keyword.DDL),
            (r'[a-zA-Z_][a-zA-Z0-9_]*', is_keyword),
            (r'[;:()\[\],\.]', tokens.Punctuation),
        ],
        'multiline-comments': [
            (r'/\*', tokens.Comment.Multiline, 'multiline-comments'),
            (r'\*/', tokens.Comment.Multiline, '#pop'),
            (r'[^/\*]+', tokens.Comment.Multiline),
            (r'[/*]', tokens.Comment.Multiline)
        ]}

    def __init__(self):
        self.filters = []

    def add_filter(self, filter_, **options):
        from debug_toolbar.utils.sqlparse.filters import Filter
        if not isinstance(filter_, Filter):
            filter_ = filter_(**options)
        self.filters.append(filter_)

    def get_tokens(self, text, unfiltered=False):
        """
        Return an iterable of (tokentype, value) pairs generated from
        `text`. If `unfiltered` is set to `True`, the filtering mechanism
        is bypassed even if filters are defined.

        Also preprocess the text, i.e. expand tabs, strip it if
        wanted, and apply registered filters.
        """
        if not isinstance(text, unicode):
            if self.encoding == 'guess':
                try:
                    text = text.decode('utf-8')
                    if text.startswith(u'\ufeff'):
                        text = text[len(u'\ufeff'):]
                except UnicodeDecodeError:
                    text = text.decode('latin1')
            elif self.encoding == 'chardet':
                try:
                    import chardet
                except ImportError:
                    raise ImportError('To enable chardet encoding guessing, '
                                      'please install the chardet library '
                                      'from http://chardet.feedparser.org/')
                enc = chardet.detect(text)
                text = text.decode(enc['encoding'])
            else:
                text = text.decode(self.encoding)

        if self.stripall:
            text = text.strip()
        elif self.stripnl:
            text = text.strip('\n')

        if self.tabsize > 0:
            text = text.expandtabs(self.tabsize)
        # if not text.endswith('\n'):
        #     text += '\n'

        def streamer():
            for i, t, v in self.get_tokens_unprocessed(text):
                yield t, v
        stream = streamer()
        if not unfiltered:
            stream = apply_filters(stream, self.filters, self)
        return stream

    def get_tokens_unprocessed(self, text, stack=('root',)):
        """
        Split ``text`` into (tokentype, text) pairs.

        ``stack`` is the initial stack (default: ``['root']``)
        """
        pos = 0
        tokendefs = self._tokens
        statestack = list(stack)
        statetokens = tokendefs[statestack[-1]]
        known_names = {}
        while 1:
            for rexmatch, action, new_state in statetokens:
                m = rexmatch(text, pos)
                if m:
                    # print rex.pattern
                    value = m.group()
                    if value in known_names:
                        yield pos, known_names[value], value
                    elif type(action) is tokens._TokenType:
                        yield pos, action, value
                    elif hasattr(action, '__call__'):
                        ttype, value = action(value)
                        known_names[value] = ttype
                        yield pos, ttype, value
                    else:
                        for item in action(self, m):
                            yield item
                    pos = m.end()
                    if new_state is not None:
                        # state transition
                        if isinstance(new_state, tuple):
                            for state in new_state:
                                if state == '#pop':
                                    statestack.pop()
                                elif state == '#push':
                                    statestack.append(statestack[-1])
                                else:
                                    statestack.append(state)
                        elif isinstance(new_state, int):
                            # pop
                            del statestack[new_state:]
                        elif new_state == '#push':
                            statestack.append(statestack[-1])
                        else:
                            assert False, "wrong state def: %r" % new_state
                        statetokens = tokendefs[statestack[-1]]
                    break
            else:
                try:
                    if text[pos] == '\n':
                        # at EOL, reset state to "root"
                        pos += 1
                        statestack = ['root']
                        statetokens = tokendefs['root']
                        yield pos, tokens.Text, u'\n'
                        continue
                    yield pos, tokens.Error, text[pos]
                    pos += 1
                except IndexError:
                    break


def tokenize(sql):
    """Tokenize sql.

    Tokenize *sql* using the :class:`Lexer` and return a 2-tuple stream
    of ``(token type, value)`` items.
    """
    lexer = Lexer()
    return lexer.get_tokens(sql)
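

# Illustrative usage sketch (added for clarity, not part of the original
# module). It assumes the debug_toolbar package is importable; the SQL string
# and the printed formatting are arbitrary examples. ``tokenize`` returns a
# lazy stream of (tokentype, value) pairs.
if __name__ == '__main__':
    for ttype, value in tokenize(u'SELECT name FROM users WHERE id = 1'):
        print '%-30s %r' % (ttype, value)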