
/sqlparse/lexer.py

https://code.google.com/p/python-sqlparse/
# -*- coding: utf-8 -*-
# Copyright (C) 2008 Andi Albrecht, albrecht.andi@gmail.com
#
# This module is part of python-sqlparse and is released under
# the BSD License: http://www.opensource.org/licenses/bsd-license.php.

"""SQL Lexer"""

# This code is based on the SqlLexer in pygments.
# http://pygments.org/
# It's separated from the rest of pygments to increase performance
# and to allow some customizations.

import re

from sqlparse import tokens
from sqlparse.keywords import KEYWORDS, KEYWORDS_COMMON


class include(str):
    pass


class combined(tuple):
    """Indicates a state combined from multiple states."""

    def __new__(cls, *args):
        return tuple.__new__(cls, args)

    def __init__(self, *args):
        # tuple.__init__ doesn't do anything
        pass


def is_keyword(value):
    test = value.upper()
    return KEYWORDS_COMMON.get(test, KEYWORDS.get(test, tokens.Name)), value


def apply_filters(stream, filters, lexer=None):
    """
    Use this method to apply an iterable of filters to
    a stream. If lexer is given it's forwarded to the
    filter, otherwise the filter receives `None`.
    """
    def _apply(filter_, stream):
        for token in filter_.filter(lexer, stream):
            yield token
    for filter_ in filters:
        stream = _apply(filter_, stream)
    return stream
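

# A minimal, illustrative sketch of an object ``apply_filters`` can consume:
# anything with a ``filter(lexer, stream)`` method that yields
# (tokentype, value) pairs will do.  The class below is NOT part of
# sqlparse.filters; its name and behaviour are assumptions chosen for
# demonstration, and it assumes ``tokens._TokenType`` supports the
# pygments-style containment test ``ttype in tokens.Keyword``.
class _UppercaseKeywordsExample(object):

    def filter(self, lexer, stream):
        # Uppercase the value of every keyword token, pass others through.
        for ttype, value in stream:
            if ttype in tokens.Keyword:
                yield ttype, value.upper()
            else:
                yield ttype, value
# To register it, pass the class to ``Lexer.add_filter`` (which instantiates
# it), or append an instance to ``lexer.filters`` directly.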


class LexerMeta(type):
    """
    Metaclass for Lexer, creates the self._tokens attribute from
    self.tokens on the first instantiation.
    """

    def _process_state(cls, unprocessed, processed, state):
        assert type(state) is str, "wrong state name %r" % state
        assert state[0] != '#', "invalid state name %r" % state
        if state in processed:
            return processed[state]
        tokenlist = processed[state] = []
        rflags = cls.flags
        for tdef in unprocessed[state]:
            if isinstance(tdef, include):
                # it's a state reference
                assert tdef != state, "circular state reference %r" % state
                tokenlist.extend(cls._process_state(
                    unprocessed, processed, str(tdef)))
                continue

            assert type(tdef) is tuple, "wrong rule def %r" % tdef

            try:
                rex = re.compile(tdef[0], rflags).match
            except Exception, err:
                raise ValueError(("uncompilable regex %r in state"
                                  " %r of %r: %s"
                                  % (tdef[0], state, cls, err)))

            assert type(tdef[1]) is tokens._TokenType or callable(tdef[1]), \
                ('token type must be simple type or callable, not %r'
                 % (tdef[1],))

            if len(tdef) == 2:
                new_state = None
            else:
                tdef2 = tdef[2]
                if isinstance(tdef2, str):
                    # an existing state
                    if tdef2 == '#pop':
                        new_state = -1
                    elif tdef2 in unprocessed:
                        new_state = (tdef2,)
                    elif tdef2 == '#push':
                        new_state = tdef2
                    elif tdef2[:5] == '#pop:':
                        new_state = -int(tdef2[5:])
                    else:
                        assert False, 'unknown new state %r' % tdef2
                elif isinstance(tdef2, combined):
                    # combine a new state from existing ones
                    new_state = '_tmp_%d' % cls._tmpname
                    cls._tmpname += 1
                    itokens = []
                    for istate in tdef2:
                        assert istate != state, \
                            'circular state ref %r' % istate
                        itokens.extend(cls._process_state(unprocessed,
                                                          processed, istate))
                    processed[new_state] = itokens
                    new_state = (new_state,)
                elif isinstance(tdef2, tuple):
                    # push more than one state
                    for state in tdef2:
                        assert (state in unprocessed or
                                state in ('#pop', '#push')), \
                            'unknown new state ' + state
                    new_state = tdef2
                else:
                    assert False, 'unknown new state def %r' % tdef2
            tokenlist.append((rex, tdef[1], new_state))
        return tokenlist

    def process_tokendef(cls):
        cls._all_tokens = {}
        cls._tmpname = 0
        processed = cls._all_tokens[cls.__name__] = {}
        #tokendefs = tokendefs or cls.tokens[name]
        for state in cls.tokens.keys():
            cls._process_state(cls.tokens, processed, state)
        return processed

    def __call__(cls, *args, **kwds):
        if not hasattr(cls, '_tokens'):
            cls._all_tokens = {}
            cls._tmpname = 0
            if hasattr(cls, 'token_variants') and cls.token_variants:
                # don't process yet
                pass
            else:
                cls._tokens = cls.process_tokendef()

        return type.__call__(cls, *args, **kwds)
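

# For illustration: after ``process_tokendef`` has run, a rule such as
#
#     (r'/\*', tokens.Comment.Multiline, 'multiline-comments')
#
# is stored in ``cls._tokens['root']`` as a triple of
# (compiled match function, token type, state transition), roughly:
#
#     (re.compile(r'/\*', re.IGNORECASE).match,
#      tokens.Comment.Multiline,
#      ('multiline-comments',))
#
# This mirrors what ``_process_state`` builds above; it is a sketch of the
# resulting shape, not an exact object identity.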


class Lexer(object):

    __metaclass__ = LexerMeta

    encoding = 'utf-8'
    stripall = False
    stripnl = False
    tabsize = 0
    flags = re.IGNORECASE

    tokens = {
        'root': [
            (r'--.*?(\r\n|\r|\n)', tokens.Comment.Single),
            # $ matches *before* newline, therefore we have two patterns
            # to match Comment.Single
            (r'--.*?$', tokens.Comment.Single),
            (r'(\r|\n|\r\n)', tokens.Newline),
            (r'\s+', tokens.Whitespace),
            (r'/\*', tokens.Comment.Multiline, 'multiline-comments'),
            (r':=', tokens.Assignment),
            (r'::', tokens.Punctuation),
            (r'[*]', tokens.Wildcard),
            (r'CASE\b', tokens.Keyword),  # extended CASE(foo)
            (r"`(``|[^`])*`", tokens.Name),
            (r"´(´´|[^´])*´", tokens.Name),
            (r'\$([a-zA-Z_][a-zA-Z0-9_]*)?\$', tokens.Name.Builtin),
            (r'\?{1}', tokens.Name.Placeholder),
            (r'[$:?%][a-zA-Z0-9_]+', tokens.Name.Placeholder),
            (r'@[a-zA-Z_][a-zA-Z0-9_]+', tokens.Name),
            (r'[a-zA-Z_][a-zA-Z0-9_]*(?=[.(])', tokens.Name),  # see issue39
            (r'[-]?0x[0-9a-fA-F]+', tokens.Number.Hexadecimal),
            (r'[-]?[0-9]*\.[0-9]+', tokens.Number.Float),
            (r'[-]?[0-9]+', tokens.Number.Integer),
            # TODO: Backslash escapes?
            (r"(''|'.*?[^\\]')", tokens.String.Single),
            # not a real string literal in ANSI SQL:
            (r'(""|".*?[^\\]")', tokens.String.Symbol),
            (r'(\[.*[^\]]\])', tokens.Name),
            (r'(LEFT |RIGHT )?(INNER |OUTER )?JOIN\b', tokens.Keyword),
            (r'END( IF| LOOP)?\b', tokens.Keyword),
            (r'NOT NULL\b', tokens.Keyword),
            (r'CREATE( OR REPLACE)?\b', tokens.Keyword.DDL),
            (r'(?<=\.)[a-zA-Z_][a-zA-Z0-9_]*', tokens.Name),
            (r'[a-zA-Z_][a-zA-Z0-9_]*', is_keyword),
            (r'[;:()\[\],\.]', tokens.Punctuation),
            (r'[<>=~!]+', tokens.Operator.Comparison),
            (r'[+/@#%^&|`?^-]+', tokens.Operator),
        ],
        'multiline-comments': [
            (r'/\*', tokens.Comment.Multiline, 'multiline-comments'),
            (r'\*/', tokens.Comment.Multiline, '#pop'),
            (r'[^/\*]+', tokens.Comment.Multiline),
            (r'[/*]', tokens.Comment.Multiline)
        ]}

    def __init__(self):
        self.filters = []

    def add_filter(self, filter_, **options):
        from sqlparse.filters import Filter
        if not isinstance(filter_, Filter):
            filter_ = filter_(**options)
        self.filters.append(filter_)

    def get_tokens(self, text, unfiltered=False):
        """
        Return an iterable of (tokentype, value) pairs generated from
        `text`. If `unfiltered` is set to `True`, the filtering mechanism
        is bypassed even if filters are defined.

        Also preprocess the text, i.e. expand tabs and strip it if
        wanted, and apply registered filters.
        """
        if not isinstance(text, unicode):
            if self.encoding == 'guess':
                try:
                    text = text.decode('utf-8')
                    if text.startswith(u'\ufeff'):
                        text = text[len(u'\ufeff'):]
                except UnicodeDecodeError:
                    text = text.decode('latin1')
            else:
                text = text.decode(self.encoding)

        if self.stripall:
            text = text.strip()
        elif self.stripnl:
            text = text.strip('\n')

        if self.tabsize > 0:
            text = text.expandtabs(self.tabsize)
        # if not text.endswith('\n'):
        #     text += '\n'

        def streamer():
            for i, t, v in self.get_tokens_unprocessed(text):
                yield t, v
        stream = streamer()
        if not unfiltered:
            stream = apply_filters(stream, self.filters, self)
        return stream

    def get_tokens_unprocessed(self, text, stack=('root',)):
        """
        Split ``text`` into (tokentype, text) pairs.

        ``stack`` is the initial stack (default: ``['root']``)
        """
        pos = 0
        tokendefs = self._tokens  # see __call__, pylint:disable=E1101
        statestack = list(stack)
        statetokens = tokendefs[statestack[-1]]
        known_names = {}
        while 1:
            for rexmatch, action, new_state in statetokens:
                m = rexmatch(text, pos)
                if m:
                    # print rex.pattern
                    value = m.group()
                    if value in known_names:
                        yield pos, known_names[value], value
                    elif type(action) is tokens._TokenType:
                        yield pos, action, value
                    elif hasattr(action, '__call__'):
                        ttype, value = action(value)
                        known_names[value] = ttype
                        yield pos, ttype, value
                    else:
                        for item in action(self, m):
                            yield item
                    pos = m.end()
                    if new_state is not None:
                        # state transition
                        if isinstance(new_state, tuple):
                            for state in new_state:
                                if state == '#pop':
                                    statestack.pop()
                                elif state == '#push':
                                    statestack.append(statestack[-1])
                                else:
                                    statestack.append(state)
                        elif isinstance(new_state, int):
                            # pop
                            del statestack[new_state:]
                        elif new_state == '#push':
                            statestack.append(statestack[-1])
                        else:
                            assert False, "wrong state def: %r" % new_state
                        statetokens = tokendefs[statestack[-1]]
                    break
            else:
                try:
                    if text[pos] == '\n':
                        # at EOL, reset state to "root"
                        pos += 1
                        statestack = ['root']
                        statetokens = tokendefs['root']
                        yield pos, tokens.Text, u'\n'
                        continue
                    yield pos, tokens.Error, text[pos]
                    pos += 1
                except IndexError:
                    break
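

# For illustration: ``get_tokens_unprocessed`` yields 3-tuples of
# (position, tokentype, value) and honours the ``stack`` argument, so lexing
# can start in a state other than 'root'.  The helper name and sample input
# below are assumptions chosen for demonstration.
def _lex_inside_comment(text=u'still a comment */ SELECT 1'):
    lexer = Lexer()
    # Start with 'multiline-comments' on top of the stack, as if the text
    # began in the middle of a /* ... */ block.
    return list(lexer.get_tokens_unprocessed(
        text, stack=('root', 'multiline-comments')))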


def tokenize(sql):
    """Tokenize sql.

    Tokenize *sql* using the :class:`Lexer` and return a 2-tuple stream
    of ``(token type, value)`` items.
    """
    lexer = Lexer()
    return lexer.get_tokens(sql)
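

if __name__ == '__main__':
    # Quick demonstration of the module; the sample statement is
    # illustrative only.
    for ttype, value in tokenize(u'select * from foo where bar = 1'):
        print ttype, repr(value)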