
/scripts/tokenize.py

http://github.com/bengardner/uncrustify
Possible License(s): GPL-2.0
#! /usr/bin/env python
# tokenize.py
#
# Parses a C/C++/C#/D/Java/Pawn/whatever file into a list of
# (string, type) tuples.
#
# punctuator lookup table
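# Each entry is [char, siblings_left, child_idx, token]:
#   char          - the character this entry matches
#   siblings_left - number of entries after this one at the same level
#   child_idx     - index of the first entry for longer punctuators that
#                   extend this prefix (0 = no longer match possible)
#   token         - the punctuator recognized at this point, or None if the
#                   prefix is not itself a valid punctuator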
punc_table = [
    [ '!', 25, 26, '!'    ],  # 0: '!'
    [ '#', 24, 35, '#'    ],  # 1: '#'
    [ '$', 23,  0, '$'    ],  # 2: '$'
    [ '%', 22, 36, '%'    ],  # 3: '%'
    [ '&', 21, 41, '&'    ],  # 4: '&'
    [ '(', 20,  0, '('    ],  # 5: '('
    [ ')', 19,  0, ')'    ],  # 6: ')'
    [ '*', 18, 43, '*'    ],  # 7: '*'
    [ '+', 17, 44, '+'    ],  # 8: '+'
    [ ',', 16,  0, ','    ],  # 9: ','
    [ '-', 15, 46, '-'    ],  # 10: '-'
    [ '.', 14, 50, '.'    ],  # 11: '.'
    [ '/', 13, 53, '/'    ],  # 12: '/'
    [ ':', 12, 54, ':'    ],  # 13: ':'
    [ ';', 11,  0, ';'    ],  # 14: ';'
    [ '<', 10, 56, '<'    ],  # 15: '<'
    [ '=',  9, 63, '='    ],  # 16: '='
    [ '>',  8, 65, '>'    ],  # 17: '>'
    [ '?',  7,  0, '?'    ],  # 18: '?'
    [ '[',  6, 70, '['    ],  # 19: '['
    [ ']',  5,  0, ']'    ],  # 20: ']'
    [ '^',  4, 71, '^'    ],  # 21: '^'
    [ '{',  3,  0, '{'    ],  # 22: '{'
    [ '|',  2, 72, '|'    ],  # 23: '|'
    [ '}',  1,  0, '}'    ],  # 24: '}'
    [ '~',  0, 74, '~'    ],  # 25: '~'
    [ '<',  3, 30, '!<'   ],  # 26: '!<'
    [ '=',  2, 33, '!='   ],  # 27: '!='
    [ '>',  1, 34, '!>'   ],  # 28: '!>'
    [ '~',  0,  0, '!~'   ],  # 29: '!~'
    [ '=',  1,  0, '!<='  ],  # 30: '!<='
    [ '>',  0, 32, '!<>'  ],  # 31: '!<>'
    [ '=',  0,  0, '!<>=' ],  # 32: '!<>='
    [ '=',  0,  0, '!=='  ],  # 33: '!=='
    [ '=',  0,  0, '!>='  ],  # 34: '!>='
    [ '#',  0,  0, '##'   ],  # 35: '##'
    [ ':',  2, 39, '%:'   ],  # 36: '%:'
    [ '=',  1,  0, '%='   ],  # 37: '%='
    [ '>',  0,  0, '%>'   ],  # 38: '%>'
    [ '%',  0, 40, None   ],  # 39: '%:%'
    [ ':',  0,  0, '%:%:' ],  # 40: '%:%:'
    [ '&',  1,  0, '&&'   ],  # 41: '&&'
    [ '=',  0,  0, '&='   ],  # 42: '&='
    [ '=',  0,  0, '*='   ],  # 43: '*='
    [ '+',  1,  0, '++'   ],  # 44: '++'
    [ '=',  0,  0, '+='   ],  # 45: '+='
    [ '-',  2,  0, '--'   ],  # 46: '--'
    [ '=',  1,  0, '-='   ],  # 47: '-='
    [ '>',  0, 49, '->'   ],  # 48: '->'
    [ '*',  0,  0, '->*'  ],  # 49: '->*'
    [ '*',  1,  0, '.*'   ],  # 50: '.*'
    [ '.',  0, 52, '..'   ],  # 51: '..'
    [ '.',  0,  0, '...'  ],  # 52: '...'
    [ '=',  0,  0, '/='   ],  # 53: '/='
    [ ':',  1,  0, '::'   ],  # 54: '::'
    [ '>',  0,  0, ':>'   ],  # 55: ':>'
    [ '%',  4,  0, '<%'   ],  # 56: '<%'
    [ ':',  3,  0, '<:'   ],  # 57: '<:'
    [ '<',  2, 61, '<<'   ],  # 58: '<<'
    [ '=',  1,  0, '<='   ],  # 59: '<='
    [ '>',  0, 62, '<>'   ],  # 60: '<>'
    [ '=',  0,  0, '<<='  ],  # 61: '<<='
    [ '=',  0,  0, '<>='  ],  # 62: '<>='
    [ '=',  0, 64, '=='   ],  # 63: '=='
    [ '=',  0,  0, '==='  ],  # 64: '==='
    [ '=',  1,  0, '>='   ],  # 65: '>='
    [ '>',  0, 67, '>>'   ],  # 66: '>>'
    [ '=',  1,  0, '>>='  ],  # 67: '>>='
    [ '>',  0, 69, '>>>'  ],  # 68: '>>>'
    [ '=',  0,  0, '>>>=' ],  # 69: '>>>='
    [ ']',  0,  0, '[]'   ],  # 70: '[]'
    [ '=',  0,  0, '^='   ],  # 71: '^='
    [ '=',  1,  0, '|='   ],  # 72: '|='
    [ '|',  0,  0, '||'   ],  # 73: '||'
    [ '=',  1,  0, '~='   ],  # 74: '~='
    [ '~',  0,  0, '~~'   ],  # 75: '~~'
]
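
# Example walk for '<<=': root entry 15 ('<') jumps to child index 56; that
# level matches entry 58 ('<<'), whose child index 61 completes '<<='.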
#
# Token types:
# 0 = newline
# 1 = punctuator
# 2 = integer
# 3 = float
# 4 = string
# 5 = identifier
#
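# The tokenizer scans its input once; each parse_* helper either consumes
# one construct (appending to self.tokens) and returns True, or returns
# False so the next helper can try.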
class tokenizer:
    def __init__(self):
        self.tokens = []
        self.text = ''
        self.text_idx = 0

    def tokenize_text(self, in_text):
        self.tokens = []
        self.text = in_text
        self.text_idx = 0
        print(in_text)
        try:
            while self.text_idx < len(self.text):
                if self.parse_whitespace():
                    continue
                elif self.text.startswith('\\\n', self.text_idx):
                    # skip a backslash-newline line continuation
                    self.text_idx += 2
                    continue
                elif self.parse_comment():
                    continue
                elif self.parse_number():
                    continue
                elif self.parse_string():
                    # string before identifier, so an 'L' wide-string
                    # prefix is not swallowed as an identifier
                    continue
                elif self.parse_identifier():
                    continue
                elif self.parse_punctuator():
                    continue
                else:
                    print('confused:', self.text[self.text_idx:])
                    break
        except:
            print('bombed')
            raise
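
    # Consumes spaces, tabs, CR, and LF. A run that contains any newline
    # emits a single ('\n', 0) token.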
    def parse_whitespace(self):
        start_idx = self.text_idx
        hit_newline = False
        while self.text_idx < len(self.text):
            if self.text[self.text_idx] in '\n\r':
                hit_newline = True
            elif not self.text[self.text_idx] in ' \t':
                break
            self.text_idx += 1
        if hit_newline:
            self.tokens.append(('\n', 0))
        return start_idx != self.text_idx
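
    # Consumes a '//' line comment (up to, not including, the newline) or a
    # '/* ... */' block comment. Comments produce no tokens.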
    def parse_comment(self):
        if not self.text[self.text_idx] == '/' or not self.text[self.text_idx + 1] in '/*':
            return False
        if self.text[self.text_idx + 1] == '/':
            while self.text_idx < len(self.text):
                if self.text[self.text_idx] in '\n\r':
                    break
                self.text_idx += 1
        else:
            while self.text_idx < len(self.text) - 1:
                if self.text[self.text_idx] == '*' and self.text[self.text_idx + 1] == '/':
                    self.text_idx += 2
                    break
                self.text_idx += 1
        return True
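
    # Consumes an identifier: a leading letter, '_', or '@', followed by any
    # mix of those characters and digits.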
    def parse_identifier(self):
        if not self.text[self.text_idx].upper() in '@_ABCDEFGHIJKLMNOPQRSTUVWXYZ':
            return False
        start_idx = self.text_idx
        while self.text_idx < len(self.text) and self.text[self.text_idx].upper() in '@_ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890':
            self.text_idx += 1
        self.tokens.append((self.text[start_idx : self.text_idx], 5))
        return True
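
    # Consumes a single- or double-quoted string, honoring backslash
    # escapes. An 'L' (wide string) prefix is included in the token.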
    def parse_string(self):
        starter = 0
        start_ch = self.text[self.text_idx]
        if start_ch == 'L' and self.text_idx + 1 < len(self.text):
            starter = 1
            start_ch = self.text[self.text_idx + 1]
        if not start_ch in '"\'':
            return False
        start_idx = self.text_idx
        self.text_idx += starter + 1
        escaped = False
        while self.text_idx < len(self.text):
            if escaped:
                escaped = False
            else:
                if self.text[self.text_idx] == '\\':
                    escaped = True
                elif self.text[self.text_idx] == start_ch:
                    self.text_idx += 1
                    break
            self.text_idx += 1
        self.tokens.append((self.text[start_idx : self.text_idx], 4))
        return True
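
    # The punc_table walk below is a greedy longest-match: each matched
    # character either ends the token or jumps to a child level (pte[2])
    # holding the longer punctuators that share the matched prefix.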
    # Checks for punctuators.
    # Returns whether a punctuator was consumed (True or False).
    def parse_punctuator(self):
        tab_idx = 0
        saved_punc = None
        saved_idx = self.text_idx
        while 1:
            pte = punc_table[tab_idx]
            if self.text_idx < len(self.text) and pte[0] == self.text[self.text_idx]:
                if pte[3] is not None:
                    saved_punc = pte[3]
                    saved_idx = self.text_idx + 1
                self.text_idx += 1
                tab_idx = pte[2]
                if tab_idx == 0:
                    break
            elif pte[1] == 0:
                break
            else:
                tab_idx += 1
        if saved_punc is not None:
            # back up over any characters consumed past the last complete match
            self.text_idx = saved_idx
            self.tokens.append((saved_punc, 1))
            return True
        return False
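
    # Consumes an integer or float literal, including 0x/0b/octal prefixes,
    # '_' digit separators, exponents, and trailing type suffixes.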
    def parse_number(self):
        # A number must start with a digit or a dot followed by a digit
        ch = self.text[self.text_idx]
        if not ch.isdigit() and (ch != '.' or not self.text[self.text_idx + 1].isdigit()):
            return False
        token_type = 2  # integer
        if ch == '.':
            token_type = 3  # float
        did_hex = False
        start_idx = self.text_idx
        # Check for Hex, Octal, or Binary
        # Note that only D and Pawn support binary, but who cares?
        #
        if ch == '0':
            self.text_idx += 1
            ch = self.text[self.text_idx].upper()
            if ch == 'X':  # hex
                did_hex = True
                self.text_idx += 1
                while self.text[self.text_idx] in '_0123456789abcdefABCDEF':
                    self.text_idx += 1
            elif ch == 'B':  # binary
                self.text_idx += 1
                while self.text[self.text_idx] in '_01':
                    self.text_idx += 1
            elif ch >= '0' and ch <= '7':  # octal (but allow decimal)
                self.text_idx += 1
                while self.text[self.text_idx] in '_0123456789':
                    self.text_idx += 1
            else:
                # either just 0 or 0.1 or 0UL, etc
                pass
        else:
            # Regular int or float
            while self.text[self.text_idx] in '_0123456789':
                self.text_idx += 1
        # Check if we stopped on a decimal point
        if self.text[self.text_idx] == '.':
            self.text_idx += 1
            token_type = 3  # float
            if did_hex:
                while self.text[self.text_idx] in '_0123456789abcdefABCDEF':
                    self.text_idx += 1
            else:
                while self.text[self.text_idx] in '_0123456789':
                    self.text_idx += 1
        # Check exponent
        # Valid exponents per language (not that it matters):
        # C/C++/D/Java: eEpP
        # C#/Pawn:      eE
        if self.text[self.text_idx] in 'eEpP':
            token_type = 3  # float
            self.text_idx += 1
            if self.text[self.text_idx] in '+-':
                self.text_idx += 1
            while self.text[self.text_idx] in '_0123456789':
                self.text_idx += 1
        # Check the suffixes
        # Valid suffixes per language (not that it matters):
        #        Integer  Float
        # C/C++: uUlL     lLfF
        # C#:    uUlL     fFdDMm
        # D:     uUL      ifFL
        # Java:  lL       fFdD
        # Pawn:  (none)   (none)
        #
        # Note that i, f, d, and m only appear in floats.
        while 1:
            if self.text[self.text_idx] in 'iIfFdDmM':
                token_type = 3  # float
            elif not self.text[self.text_idx] in 'lLuU':
                break
            self.text_idx += 1
        self.tokens.append((self.text[start_idx : self.text_idx], token_type))
        return True
  284. text = """
  285. 1.23+4-3*16%2 *sin(1.e-3 + .5p32) "hello" and "hello\\"there"
  286. 123 // some comment
  287. a = b + c;
  288. #define abc \\
  289. 5
  290. d = 5 /* hello */ + 3;
  291. """
  292. t=tokenizer()
  293. t.tokenize_text(text)
  294. print t.tokens