/scripts/tokenize.py
#! /usr/bin/env python
# tokenize.py
#
# Parses a C/C++/C#/D/Java/Pawn/whatever file into an array of
# tuples (string, type)
#
# punctuator lookup table
punc_table = [
    [ '!',  25, 26, '!'    ],  #  0: '!'
    [ '#',  24, 35, '#'    ],  #  1: '#'
    [ '$',  23,  0, '$'    ],  #  2: '$'
    [ '%',  22, 36, '%'    ],  #  3: '%'
    [ '&',  21, 41, '&'    ],  #  4: '&'
    [ '(',  20,  0, '('    ],  #  5: '('
    [ ')',  19,  0, ')'    ],  #  6: ')'
    [ '*',  18, 43, '*'    ],  #  7: '*'
    [ '+',  17, 44, '+'    ],  #  8: '+'
    [ ',',  16,  0, ','    ],  #  9: ','
    [ '-',  15, 46, '-'    ],  # 10: '-'
    [ '.',  14, 50, '.'    ],  # 11: '.'
    [ '/',  13, 53, '/'    ],  # 12: '/'
    [ ':',  12, 54, ':'    ],  # 13: ':'
    [ ';',  11,  0, ';'    ],  # 14: ';'
    [ '<',  10, 56, '<'    ],  # 15: '<'
    [ '=',   9, 63, '='    ],  # 16: '='
    [ '>',   8, 65, '>'    ],  # 17: '>'
    [ '?',   7,  0, '?'    ],  # 18: '?'
    [ '[',   6, 70, '['    ],  # 19: '['
    [ ']',   5,  0, ']'    ],  # 20: ']'
    [ '^',   4, 71, '^'    ],  # 21: '^'
    [ '{',   3,  0, '{'    ],  # 22: '{'
    [ '|',   2, 72, '|'    ],  # 23: '|'
    [ '}',   1,  0, '}'    ],  # 24: '}'
    [ '~',   0, 74, '~'    ],  # 25: '~'
    [ '<',   3, 30, '!<'   ],  # 26: '!<'
    [ '=',   2, 33, '!='   ],  # 27: '!='
    [ '>',   1, 34, '!>'   ],  # 28: '!>'
    [ '~',   0,  0, '!~'   ],  # 29: '!~'
    [ '=',   1,  0, '!<='  ],  # 30: '!<='
    [ '>',   0, 32, '!<>'  ],  # 31: '!<>'
    [ '=',   0,  0, '!<>=' ],  # 32: '!<>='
    [ '=',   0,  0, '!=='  ],  # 33: '!=='
    [ '=',   0,  0, '!>='  ],  # 34: '!>='
    [ '#',   0,  0, '##'   ],  # 35: '##'
    [ ':',   2, 39, '%:'   ],  # 36: '%:'
    [ '=',   1,  0, '%='   ],  # 37: '%='
    [ '>',   0,  0, '%>'   ],  # 38: '%>'
    [ '%',   0, 40, None   ],  # 39: '%:%'
    [ ':',   0,  0, '%:%:' ],  # 40: '%:%:'
    [ '&',   1,  0, '&&'   ],  # 41: '&&'
    [ '=',   0,  0, '&='   ],  # 42: '&='
    [ '=',   0,  0, '*='   ],  # 43: '*='
    [ '+',   1,  0, '++'   ],  # 44: '++'
    [ '=',   0,  0, '+='   ],  # 45: '+='
    [ '-',   2,  0, '--'   ],  # 46: '--'
    [ '=',   1,  0, '-='   ],  # 47: '-='
    [ '>',   0, 49, '->'   ],  # 48: '->'
    [ '*',   0,  0, '->*'  ],  # 49: '->*'
    [ '*',   1,  0, '.*'   ],  # 50: '.*'
    [ '.',   0, 52, '..'   ],  # 51: '..'
    [ '.',   0,  0, '...'  ],  # 52: '...'
    [ '=',   0,  0, '/='   ],  # 53: '/='
    [ ':',   1,  0, '::'   ],  # 54: '::'
    [ '>',   0,  0, ':>'   ],  # 55: ':>'
    [ '%',   4,  0, '<%'   ],  # 56: '<%'
    [ ':',   3,  0, '<:'   ],  # 57: '<:'
    [ '<',   2, 61, '<<'   ],  # 58: '<<'
    [ '=',   1,  0, '<='   ],  # 59: '<='
    [ '>',   0, 62, '<>'   ],  # 60: '<>'
    [ '=',   0,  0, '<<='  ],  # 61: '<<='
    [ '=',   0,  0, '<>='  ],  # 62: '<>='
    [ '=',   0, 64, '=='   ],  # 63: '=='
    [ '=',   0,  0, '==='  ],  # 64: '==='
    [ '=',   1,  0, '>='   ],  # 65: '>='
    [ '>',   0, 67, '>>'   ],  # 66: '>>'
    [ '=',   1,  0, '>>='  ],  # 67: '>>='
    [ '>',   0, 69, '>>>'  ],  # 68: '>>>'
    [ '=',   0,  0, '>>>=' ],  # 69: '>>>='
    [ ']',   0,  0, '[]'   ],  # 70: '[]'
    [ '=',   0,  0, '^='   ],  # 71: '^='
    [ '=',   1,  0, '|='   ],  # 72: '|='
    [ '|',   0,  0, '||'   ],  # 73: '||'
    [ '=',   1,  0, '~='   ],  # 74: '~='
    [ '~',   0,  0, '~~'   ],  # 75: '~~'
]
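
# A rough reading of the table layout, matching how parse_punctuator() below
# walks it: each row is [match_char, siblings_left, child_row, token] (the
# field names are descriptive labels, not names used by the code).  If
# 'match_char' equals the current input character, 'token' (when not None)
# becomes the best match so far and the walk continues at row 'child_row'
# (0 meaning no children); otherwise the walk falls through to the next row
# until 'siblings_left' reaches 0.  For example, the input ">>=" matches
# row 17 ('>'), row 66 ('>>'), then row 67 ('>>='), producing ('>>=', 1).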
#
# Token types:
#   0 = newline
#   1 = punctuator
#   2 = integer
#   3 = float
#   4 = string
#   5 = identifier
#
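# As an illustration, feeding the line "a = b + 1;\n" through tokenize_text()
# below should yield roughly:
#   [('a', 5), ('=', 1), ('b', 5), ('+', 1), ('1', 2), (';', 1), ('\n', 0)]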
class tokenizer:
    def __init__(self):
        self.tokens = []
        self.text = ''
        self.text_idx = 0

    def tokenize_text(self, in_text):
        self.tokens = []
        self.text = in_text
        self.text_idx = 0
        print in_text
        try:
            while self.text_idx < len(self.text):
                if self.parse_whitespace():
                    continue
                elif self.text[self.text_idx] == '\\' and self.text[self.text_idx + 1] == '\n':
                    self.text_idx += 2
                    continue
                elif self.parse_comment():
                    continue
                elif self.parse_number():
                    continue
                elif self.parse_identifier():
                    continue
                elif self.parse_string():
                    continue
                elif self.parse_punctuator():
                    continue
                else:
                    print 'confused:', self.text[self.text_idx:]
                    break
        except:
            print 'bombed'
            raise

    def parse_whitespace(self):
        start_idx = self.text_idx
        hit_newline = False
        while self.text_idx < len(self.text):
            if self.text[self.text_idx] in '\n\r':
                hit_newline = True
            elif not self.text[self.text_idx] in ' \t':
                break
            self.text_idx += 1
        if hit_newline:
            self.tokens.append(('\n', 0))
        return start_idx != self.text_idx

    def parse_comment(self):
        if not self.text[self.text_idx] == '/' or not self.text[self.text_idx + 1] in '/*':
            return False
        if self.text[self.text_idx + 1] == '/':
            while self.text_idx < len(self.text):
                if self.text[self.text_idx] in '\n\r':
                    break
                self.text_idx += 1
        else:
            while self.text_idx < len(self.text) - 1:
                if self.text[self.text_idx] == '*' and self.text[self.text_idx + 1] == '/':
                    self.text_idx += 2
                    break
                self.text_idx += 1
        return True

    def parse_identifier(self):
        if not self.text[self.text_idx].upper() in '@_ABCDEFGHIJKLMNOPQRSTUVWXYZ':
            return False
        start_idx = self.text_idx
        while self.text_idx < len(self.text) and self.text[self.text_idx].upper() in '@_ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890':
            self.text_idx += 1
        self.tokens.append((self.text[start_idx : self.text_idx], 5))
        return True

    def parse_string(self):
        starter = 0
        start_ch = self.text[self.text_idx]
        # allow a leading L for wide-character literals, e.g. L"text"
        if start_ch == 'L':
            starter = 1
            start_ch = self.text[self.text_idx + 1]
        if not start_ch in '"\'':
            return False
        start_idx = self.text_idx
        self.text_idx += starter + 1
        escaped = False
        while self.text_idx < len(self.text):
            if escaped:
                escaped = False
            else:
                if self.text[self.text_idx] == '\\':
                    escaped = True
                elif self.text[self.text_idx] == start_ch:
                    self.text_idx += 1
                    break
            self.text_idx += 1
        self.tokens.append((self.text[start_idx : self.text_idx], 4))
        return True

    # Checks for punctuators.
    # Returns whether a punctuator was consumed (True or False).
    def parse_punctuator(self):
        start_idx = self.text_idx
        tab_idx = 0
        saved_punc = None
        while True:
            pte = punc_table[tab_idx]
            if pte[0] == self.text[self.text_idx]:
                if pte[3] != None:
                    saved_punc = pte[3]
                self.text_idx += 1
                tab_idx = pte[2]
                if tab_idx == 0:
                    break
            elif pte[1] == 0:
                break
            else:
                tab_idx += 1
        if saved_punc != None:
            # Rewind to just past the longest complete punctuator actually
            # matched, in case the walk went down a dead-end prefix.
            self.text_idx = start_idx + len(saved_punc)
            self.tokens.append((saved_punc, 1))
            return True
        self.text_idx = start_idx
        return False

    def parse_number(self):
        # A number must start with a digit or a dot followed by a digit
        ch = self.text[self.text_idx]
        if not ch.isdigit() and (ch != '.' or not self.text[self.text_idx + 1].isdigit()):
            return False
        token_type = 2  # integer
        if ch == '.':
            token_type = 3  # float
        did_hex = False
        start_idx = self.text_idx

        # Check for Hex, Octal, or Binary
        # Note that only D and Pawn support binary, but who cares?
        #
        if ch == '0':
            self.text_idx += 1
            ch = self.text[self.text_idx].upper()
            if ch == 'X':  # hex
                did_hex = True
                self.text_idx += 1
                while self.text[self.text_idx] in '_0123456789abcdefABCDEF':
                    self.text_idx += 1
            elif ch == 'B':  # binary
                self.text_idx += 1
                while self.text[self.text_idx] in '_01':
                    self.text_idx += 1
            elif ch >= '0' and ch <= '7':  # octal (but allow decimal)
                self.text_idx += 1
                while self.text[self.text_idx] in '_0123456789':
                    self.text_idx += 1
            else:
                # either just 0 or 0.1 or 0UL, etc
                pass
        else:
            # Regular int or float
            while self.text[self.text_idx] in '_0123456789':
                self.text_idx += 1

        # Check if we stopped on a decimal point
        if self.text[self.text_idx] == '.':
            self.text_idx += 1
            token_type = 3  # float
            if did_hex:
                while self.text[self.text_idx] in '_0123456789abcdefABCDEF':
                    self.text_idx += 1
            else:
                while self.text[self.text_idx] in '_0123456789':
                    self.text_idx += 1

        # Check exponent
        # Valid exponents per language (not that it matters):
        #   C/C++/D/Java: eEpP
        #   C#/Pawn:      eE
        if self.text[self.text_idx] in 'eEpP':
            token_type = 3  # float
            self.text_idx += 1
            if self.text[self.text_idx] in '+-':
                self.text_idx += 1
            while self.text[self.text_idx] in '_0123456789':
                self.text_idx += 1

        # Check the suffixes
        # Valid suffixes per language (not that it matters):
        #          Integer   Float
        #   C/C++: uUlL      lLfF
        #   C#:    uUlL      fFdDMm
        #   D:     uUL       ifFL
        #   Java:  lL        fFdD
        #   Pawn:  (none)    (none)
        #
        # Note that i, f, d, and m only appear in floats.
        while True:
            if self.text[self.text_idx] in 'iIfFdDmM':
                token_type = 3  # float
            elif not self.text[self.text_idx] in 'lLuU':
                break
            self.text_idx += 1

        self.tokens.append((self.text[start_idx : self.text_idx], token_type))
        return True
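
# A few literals and how the parse_number() logic above should classify them
# (2 = integer, 3 = float), assuming each is followed by ordinary text:
#   123      -> ('123', 2)        0x1FUL   -> ('0x1FUL', 2)
#   0b1010   -> ('0b1010', 2)     .5       -> ('.5', 3)
#   1.5e-3   -> ('1.5e-3', 3)     10f      -> ('10f', 3)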
text = """
1.23+4-3*16%2 *sin(1.e-3 + .5p32) "hello" and "hello\\"there"
123 // some comment
a = b + c;
#define abc \\
5
d = 5 /* hello */ + 3;
"""

t = tokenizer()
t.tokenize_text(text)
print t.tokens
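
# A minimal usage sketch beyond the built-in self-test above: tokenize a file
# named on the command line, if one is given.  The command-line handling here
# is an illustration, not part of the original script.
import sys

if len(sys.argv) > 1:
    t2 = tokenizer()
    t2.tokenize_text(open(sys.argv[1]).read())
    for tok, tok_type in t2.tokens:
        print tok_type, tok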