/Lib/lib2to3/pgen2/tokenize.py
# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from lib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB][01]*'
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[oO]?[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken
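
# For illustration: group(), any() and maybe() above just build alternation
# patterns, e.g.
#     group('a', 'b')  ->  '(a|b)'
#     any('a', 'b')    ->  '(a|b)*'
#     maybe('a', 'b')  ->  '(a|b)?'
# so the composed Number pattern matches forms such as '1.5e-3', '0x1fL' and '3j'.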

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"', ):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object.  It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass
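
# For illustration, a minimal usage sketch (assuming a StringIO-backed
# readline; any readline-like callable works):
#     from StringIO import StringIO
#     tokenize(StringIO("x = 1\n").readline)
# prints one line per token (NAME, OP, NUMBER, NEWLINE, ENDMARKER) via the
# default printtoken callback.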

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
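
# For illustration, a minimal round-trip sketch with full 5-tuples (assuming
# the source fits in memory):
#     source = "a = 1 + 2\n"
#     readline = iter([source, ""]).next
#     assert untokenize(generate_tokens(readline)) == source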

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0
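
        # The loop below scans the rest of the line with pseudoprog and
        # classifies each match mainly by its first character (digit or '.'
        # -> NUMBER, quote prefix -> STRING, letter or underscore -> NAME,
        # '#' -> COMMENT, line end -> NEWLINE/NL, otherwise OP).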

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    yield (newline, token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)
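
# For illustration, the generator interface can also be consumed directly
# (hypothetical file name):
#     for type, text, start, end, line in generate_tokens(open("example.py").readline):
#         print tok_name[type], repr(text)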