/scripts/tokenize.py
#! /usr/bin/env python
# tokenize.py
#
# Parses a C/C++/C#/D/Java/Pawn/whatever file into an array of
# tuples (string, type)
#
# punctuator lookup table
punc_table = [
    [ '!',  25, 26, '!'    ],  #  0: '!'
    [ '#',  24, 35, '#'    ],  #  1: '#'
    [ '$',  23,  0, '$'    ],  #  2: '$'
    [ '%',  22, 36, '%'    ],  #  3: '%'
    [ '&',  21, 41, '&'    ],  #  4: '&'
    [ '(',  20,  0, '('    ],  #  5: '('
    [ ')',  19,  0, ')'    ],  #  6: ')'
    [ '*',  18, 43, '*'    ],  #  7: '*'
    [ '+',  17, 44, '+'    ],  #  8: '+'
    [ ',',  16,  0, ','    ],  #  9: ','
    [ '-',  15, 46, '-'    ],  # 10: '-'
    [ '.',  14, 50, '.'    ],  # 11: '.'
    [ '/',  13, 53, '/'    ],  # 12: '/'
    [ ':',  12, 54, ':'    ],  # 13: ':'
    [ ';',  11,  0, ';'    ],  # 14: ';'
    [ '<',  10, 56, '<'    ],  # 15: '<'
    [ '=',   9, 63, '='    ],  # 16: '='
    [ '>',   8, 65, '>'    ],  # 17: '>'
    [ '?',   7,  0, '?'    ],  # 18: '?'
    [ '[',   6, 70, '['    ],  # 19: '['
    [ ']',   5,  0, ']'    ],  # 20: ']'
    [ '^',   4, 71, '^'    ],  # 21: '^'
    [ '{',   3,  0, '{'    ],  # 22: '{'
    [ '|',   2, 72, '|'    ],  # 23: '|'
    [ '}',   1,  0, '}'    ],  # 24: '}'
    [ '~',   0, 74, '~'    ],  # 25: '~'
    [ '<',   3, 30, '!<'   ],  # 26: '!<'
    [ '=',   2, 33, '!='   ],  # 27: '!='
    [ '>',   1, 34, '!>'   ],  # 28: '!>'
    [ '~',   0,  0, '!~'   ],  # 29: '!~'
    [ '=',   1,  0, '!<='  ],  # 30: '!<='
    [ '>',   0, 32, '!<>'  ],  # 31: '!<>'
    [ '=',   0,  0, '!<>=' ],  # 32: '!<>='
    [ '=',   0,  0, '!=='  ],  # 33: '!=='
    [ '=',   0,  0, '!>='  ],  # 34: '!>='
    [ '#',   0,  0, '##'   ],  # 35: '##'
    [ ':',   2, 39, '%:'   ],  # 36: '%:'
    [ '=',   1,  0, '%='   ],  # 37: '%='
    [ '>',   0,  0, '%>'   ],  # 38: '%>'
    [ '%',   0, 40, None   ],  # 39: '%:%'
    [ ':',   0,  0, '%:%:' ],  # 40: '%:%:'
    [ '&',   1,  0, '&&'   ],  # 41: '&&'
    [ '=',   0,  0, '&='   ],  # 42: '&='
    [ '=',   0,  0, '*='   ],  # 43: '*='
    [ '+',   1,  0, '++'   ],  # 44: '++'
    [ '=',   0,  0, '+='   ],  # 45: '+='
    [ '-',   2,  0, '--'   ],  # 46: '--'
    [ '=',   1,  0, '-='   ],  # 47: '-='
    [ '>',   0, 49, '->'   ],  # 48: '->'
    [ '*',   0,  0, '->*'  ],  # 49: '->*'
    [ '*',   1,  0, '.*'   ],  # 50: '.*'
    [ '.',   0, 52, '..'   ],  # 51: '..'
    [ '.',   0,  0, '...'  ],  # 52: '...'
    [ '=',   0,  0, '/='   ],  # 53: '/='
    [ ':',   1,  0, '::'   ],  # 54: '::'
    [ '>',   0,  0, ':>'   ],  # 55: ':>'
    [ '%',   4,  0, '<%'   ],  # 56: '<%'
    [ ':',   3,  0, '<:'   ],  # 57: '<:'
    [ '<',   2, 61, '<<'   ],  # 58: '<<'
    [ '=',   1,  0, '<='   ],  # 59: '<='
    [ '>',   0, 62, '<>'   ],  # 60: '<>'
    [ '=',   0,  0, '<<='  ],  # 61: '<<='
    [ '=',   0,  0, '<>='  ],  # 62: '<>='
    [ '=',   0, 64, '=='   ],  # 63: '=='
    [ '=',   0,  0, '==='  ],  # 64: '==='
    [ '=',   1,  0, '>='   ],  # 65: '>='
    [ '>',   0, 67, '>>'   ],  # 66: '>>'
    [ '=',   1,  0, '>>='  ],  # 67: '>>='
    [ '>',   0, 69, '>>>'  ],  # 68: '>>>'
    [ '=',   0,  0, '>>>=' ],  # 69: '>>>='
    [ ']',   0,  0, '[]'   ],  # 70: '[]'
    [ '=',   0,  0, '^='   ],  # 71: '^='
    [ '=',   1,  0, '|='   ],  # 72: '|='
    [ '|',   0,  0, '||'   ],  # 73: '||'
    [ '=',   1,  0, '~='   ],  # 74: '~='
    [ '~',   0,  0, '~~'   ],  # 75: '~~'
]
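
# A rough reading of the table layout, matching how parse_punctuator() below
# walks it: each row is [match_char, siblings_left, child_row, token] (the
# field names are descriptive labels, not names used by the code).  If
# 'match_char' equals the current input character, 'token' (when not None)
# becomes the best match so far and the walk continues at row 'child_row'
# (0 meaning no children); otherwise the walk falls through to the next row
# until 'siblings_left' reaches 0.  For example, the input ">>=" matches
# row 17 ('>'), row 66 ('>>'), then row 67 ('>>='), producing ('>>=', 1).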
#
# Token types:
#   0 = newline
#   1 = punctuator
#   2 = integer
#   3 = float
#   4 = string
#   5 = identifier
#
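# As an illustration, feeding the line "a = b + 1;\n" through tokenize_text()
# below should yield roughly:
#   [('a', 5), ('=', 1), ('b', 5), ('+', 1), ('1', 2), (';', 1), ('\n', 0)]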
class tokenizer:
    def __init__(self):
        self.tokens = []
        self.text = ''
        self.text_idx = 0

    def tokenize_text(self, in_text):
        self.tokens = []
        self.text = in_text
        self.text_idx = 0
        print in_text
        try:
            while self.text_idx < len(self.text):
                if self.parse_whitespace():
                    continue
                elif self.text[self.text_idx] == '\\' and self.text[self.text_idx + 1] == '\n':
                    self.text_idx += 2
                    continue
                elif self.parse_comment():
                    continue
                elif self.parse_number():
                    continue
                elif self.parse_identifier():
                    continue
                elif self.parse_string():
                    continue
                elif self.parse_punctuator():
                    continue
                else:
                    print 'confused:', self.text[self.text_idx:]
                    break
        except:
            print 'bombed'
            raise

    def parse_whitespace(self):
        start_idx = self.text_idx
        hit_newline = False
        while self.text_idx < len(self.text):
            if self.text[self.text_idx] in '\n\r':
                hit_newline = True
            elif not self.text[self.text_idx] in ' \t':
                break
            self.text_idx += 1
        if hit_newline:
            self.tokens.append(('\n', 0))
        return start_idx != self.text_idx

    def parse_comment(self):
        if not self.text[self.text_idx] == '/' or not self.text[self.text_idx + 1] in '/*':
            return False
        if self.text[self.text_idx + 1] == '/':
            while self.text_idx < len(self.text):
                if self.text[self.text_idx] in '\n\r':
                    break
                self.text_idx += 1
        else:
            while self.text_idx < len(self.text) - 1:
                if self.text[self.text_idx] == '*' and self.text[self.text_idx + 1] == '/':
                    self.text_idx += 2
                    break
                self.text_idx += 1
        return True

    def parse_identifier(self):
        if not self.text[self.text_idx].upper() in '@_ABCDEFGHIJKLMNOPQRSTUVWXYZ':
            return False
        start_idx = self.text_idx
        while self.text_idx < len(self.text) and self.text[self.text_idx].upper() in '@_ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890':
            self.text_idx += 1
        self.tokens.append((self.text[start_idx : self.text_idx], 5))
        return True

    def parse_string(self):
        starter = 0
        start_ch = self.text[self.text_idx]
        # allow a leading L for wide-character literals, e.g. L"text"
        if start_ch == 'L':
            starter = 1
            start_ch = self.text[self.text_idx + 1]
        if not start_ch in '"\'':
            return False
        start_idx = self.text_idx
        self.text_idx += starter + 1
        escaped = False
        while self.text_idx < len(self.text):
            if escaped:
                escaped = False
            else:
                if self.text[self.text_idx] == '\\':
                    escaped = True
                elif self.text[self.text_idx] == start_ch:
                    self.text_idx += 1
                    break
            self.text_idx += 1
        self.tokens.append((self.text[start_idx : self.text_idx], 4))
        return True

    # Checks for punctuators.
    # Returns whether a punctuator was consumed (True or False).
    def parse_punctuator(self):
        start_idx = self.text_idx
        tab_idx = 0
        saved_punc = None
        while True:
            pte = punc_table[tab_idx]
            if pte[0] == self.text[self.text_idx]:
                if pte[3] != None:
                    saved_punc = pte[3]
                self.text_idx += 1
                tab_idx = pte[2]
                if tab_idx == 0:
                    break
            elif pte[1] == 0:
                break
            else:
                tab_idx += 1
        if saved_punc != None:
            # Rewind to just past the longest complete punctuator actually
            # matched, in case the walk went down a dead-end prefix.
            self.text_idx = start_idx + len(saved_punc)
            self.tokens.append((saved_punc, 1))
            return True
        self.text_idx = start_idx
        return False

    def parse_number(self):
        # A number must start with a digit or a dot followed by a digit
        ch = self.text[self.text_idx]
        if not ch.isdigit() and (ch != '.' or not self.text[self.text_idx + 1].isdigit()):
            return False
        token_type = 2  # integer
        if ch == '.':
            token_type = 3  # float
        did_hex = False
        start_idx = self.text_idx

        # Check for Hex, Octal, or Binary
        # Note that only D and Pawn support binary, but who cares?
        #
        if ch == '0':
            self.text_idx += 1
            ch = self.text[self.text_idx].upper()
            if ch == 'X':  # hex
                did_hex = True
                self.text_idx += 1
                while self.text[self.text_idx] in '_0123456789abcdefABCDEF':
                    self.text_idx += 1
            elif ch == 'B':  # binary
                self.text_idx += 1
                while self.text[self.text_idx] in '_01':
                    self.text_idx += 1
            elif ch >= '0' and ch <= '7':  # octal (but allow decimal)
                self.text_idx += 1
                while self.text[self.text_idx] in '_0123456789':
                    self.text_idx += 1
            else:
                # either just 0 or 0.1 or 0UL, etc
                pass
        else:
            # Regular int or float
            while self.text[self.text_idx] in '_0123456789':
                self.text_idx += 1

        # Check if we stopped on a decimal point
        if self.text[self.text_idx] == '.':
            self.text_idx += 1
            token_type = 3  # float
            if did_hex:
                while self.text[self.text_idx] in '_0123456789abcdefABCDEF':
                    self.text_idx += 1
            else:
                while self.text[self.text_idx] in '_0123456789':
                    self.text_idx += 1

        # Check exponent
        # Valid exponents per language (not that it matters):
        #   C/C++/D/Java: eEpP
        #   C#/Pawn:      eE
        if self.text[self.text_idx] in 'eEpP':
            token_type = 3  # float
            self.text_idx += 1
            if self.text[self.text_idx] in '+-':
                self.text_idx += 1
            while self.text[self.text_idx] in '_0123456789':
                self.text_idx += 1

        # Check the suffixes
        # Valid suffixes per language (not that it matters):
        #          Integer   Float
        #   C/C++: uUlL      lLfF
        #   C#:    uUlL      fFdDMm
        #   D:     uUL       ifFL
        #   Java:  lL        fFdD
        #   Pawn:  (none)    (none)
        #
        # Note that i, f, d, and m only appear in floats.
        while True:
            if self.text[self.text_idx] in 'iIfFdDmM':
                token_type = 3  # float
            elif not self.text[self.text_idx] in 'lLuU':
                break
            self.text_idx += 1

        self.tokens.append((self.text[start_idx : self.text_idx], token_type))
        return True
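
# A few literals and how the parse_number() logic above should classify them
# (2 = integer, 3 = float), assuming each is followed by ordinary text:
#   123      -> ('123', 2)        0x1FUL   -> ('0x1FUL', 2)
#   0b1010   -> ('0b1010', 2)     .5       -> ('.5', 3)
#   1.5e-3   -> ('1.5e-3', 3)     10f      -> ('10f', 3)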
text = """
1.23+4-3*16%2 *sin(1.e-3 + .5p32) "hello" and "hello\\"there"
123 // some comment
a = b + c;
#define abc \\
5
d = 5 /* hello */ + 3;
"""

t = tokenizer()
t.tokenize_text(text)
print t.tokens
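
# A minimal usage sketch beyond the built-in self-test above: tokenize a file
# named on the command line, if one is given.  The command-line handling here
# is an illustration, not part of the original script.
import sys

if len(sys.argv) > 1:
    t2 = tokenizer()
    t2.tokenize_text(open(sys.argv[1]).read())
    for tok, tok_type in t2.tokens:
        print tok_type, tok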